From 9bcea110683dfbb3be1689e301f7ccf9dee2b2b1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 19 Mar 2026 18:51:09 +0100 Subject: [PATCH] hardened Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 16 +- Documentation/admin-guide/sysctl/kernel.rst | 22 ++ Documentation/networking/ip-sysctl.rst | 18 + arch/Kconfig | 5 +- arch/arm64/Kconfig | 2 + arch/arm64/configs/defconfig | 1 - arch/arm64/include/asm/elf.h | 14 +- arch/x86/Kconfig | 8 +- arch/x86/configs/x86_64_defconfig | 1 - arch/x86/include/asm/elf.h | 14 +- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/process.c | 5 +- arch/x86/mm/init_32.c | 4 +- arch/x86/mm/init_64.c | 4 +- drivers/ata/libata-core.c | 1 + drivers/char/Kconfig | 2 - drivers/tty/Kconfig | 2 - drivers/tty/tty_io.c | 18 + drivers/usb/core/Makefile | 1 + drivers/usb/core/hub.c | 6 + drivers/usb/core/sysctl.c | 35 ++ drivers/usb/core/usb.c | 9 + fs/inode.c | 13 + fs/namei.c | 8 +- fs/nfs/Kconfig | 1 - fs/overlayfs/Kconfig | 16 + fs/overlayfs/super.c | 2 + fs/proc/Kconfig | 1 - fs/proc/proc_sysctl.c | 1 + fs/stat.c | 23 +- include/linux/cache.h | 2 + include/linux/capability.h | 5 + include/linux/fs.h | 11 + include/linux/fsnotify.h | 3 + include/linux/highmem.h | 7 + include/linux/interrupt.h | 2 +- include/linux/kobject_ns.h | 2 +- include/linux/perf_event.h | 8 + include/linux/sysctl.h | 2 + include/linux/tty.h | 2 + include/linux/usb.h | 11 + include/linux/user_namespace.h | 4 + include/net/tcp.h | 1 + init/Kconfig | 11 +- io_uring/io_uring.c | 5 +- kernel/audit.c | 3 + kernel/bpf/core.c | 2 +- kernel/capability.c | 6 + kernel/events/core.c | 7 +- kernel/fork.c | 39 +-- kernel/printk/sysctl.c | 9 - kernel/softirq.c | 4 +- kernel/sysctl.c | 51 +++ kernel/user_namespace.c | 7 + lib/Kconfig.debug | 9 +- lib/Kconfig.kfence | 9 + lib/kobject.c | 4 +- lib/nlattr.c | 2 + lib/vsprintf.c | 2 +- mm/Kconfig | 26 +- mm/Kconfig.debug | 3 +- mm/internal.h | 3 + mm/kfence/report.c | 5 + 
mm/mm_init.c | 3 + mm/mmap.c | 7 + mm/page_alloc.c | 34 ++ mm/slab.h | 22 +- mm/slab_common.c | 6 +- mm/slub.c | 327 +++++++++++++++--- mm/util.c | 4 +- mm/vma_exec.c | 3 + net/ipv4/Kconfig | 24 ++ net/ipv4/sysctl_net_ipv4.c | 9 + net/ipv4/tcp_input.c | 3 +- scripts/Makefile.modpost | 1 + scripts/gcc-plugins/Kconfig | 5 + scripts/mod/modpost.c | 33 +- security/Kconfig | 28 +- security/Kconfig.hardening | 21 ++ security/selinux/Kconfig | 2 +- security/selinux/hooks.c | 12 - security/selinux/selinuxfs.c | 10 +- security/yama/Kconfig | 2 +- tools/perf/Documentation/security.txt | 1 + tools/perf/util/evsel.c | 1 + 86 files changed, 890 insertions(+), 190 deletions(-) create mode 100644 drivers/usb/core/sysctl.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3e9684f63c0..bf0ee03313cb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -775,17 +775,6 @@ Kernel parameters Format: { "0" | "1" } Default: 0 (1 if CONFIG_DEBUG_VM is set) - checkreqprot= [SELINUX] Set initial checkreqprot flag value. - Format: { "0" | "1" } - See security/selinux/Kconfig help text. - 0 -- check protection applied by kernel (includes - any implied execute protection). - 1 -- check protection requested by application. - Default value is set via a kernel config option. - Value can be changed at runtime via - /sys/fs/selinux/checkreqprot. - Setting checkreqprot to 1 is deprecated. - cio_ignore= [S390] See Documentation/arch/s390/common_io.rst for details. @@ -4954,6 +4943,11 @@ Kernel parameters the specified number of seconds. This is to be used if your oopses keep scrolling off the screen. + extra_latent_entropy + Enable a very simple form of latent entropy extraction + from the first 4GB of memory as the bootmem allocator + passes the memory pages to the buddy allocator. + pcbit= [HW,ISDN] pci=option[,option...] [PCI,EARLY] various PCI subsystem options. 
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 239da22c4e28..c31be9b6992e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1011,6 +1011,8 @@ with respect to CAP_PERFMON use cases. >=1 Disallow CPU event access by users without ``CAP_PERFMON``. >=2 Disallow kernel profiling by users without ``CAP_PERFMON``. + +>=3 Disallow use of any event by users without ``CAP_PERFMON``. === ================================================================== @@ -1599,6 +1601,26 @@ allow them to remain in low power states longer. Default is set (1). +tiocsti_restrict +================ + +This toggle indicates whether unprivileged users are prevented from using the +``TIOCSTI`` ioctl to inject commands into other processes which share a tty +session. + += ============================================================================ +0 No restriction, except the default one of only being able to inject commands + into one's own tty. +1 Users must have ``CAP_SYS_ADMIN`` to use the ``TIOCSTI`` ioctl. += ============================================================================ + +When user namespaces are in use, the check for ``CAP_SYS_ADMIN`` is done +against the user namespace that originally opened the tty. + +The kernel config option ``CONFIG_SECURITY_TIOCSTI_RESTRICT`` sets the default +value of ``tiocsti_restrict``. 
+ + traceoff_on_warning =================== diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 2c65d57103fb..3ed58473d257 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -907,6 +907,24 @@ tcp_backlog_ack_defer - BOOLEAN Default: 1 (enabled) +tcp_simult_connect - BOOLEAN + Enable TCP simultaneous connect. This feature introduces a weakness into + Linux's otherwise strict TCP implementation: it allows two clients to + connect to each other without either entering a listening state. An + attacker can exploit this to prevent a client from connecting to a known + server, provided the source port of the connection is guessed correctly. + + As the weakness could be used to prevent an antivirus or IPS from fetching + updates, or prevent an SSL gateway from fetching a CRL, it should be + eliminated by disabling this option. Though Linux is one of the few operating + systems supporting simultaneous connect, it has no legitimate use in + practice and is rarely supported by firewalls. + + Disabling this may break TCP STUNT, which is used by some applications for + NAT traversal. + + Default: Value of CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON + tcp_slow_start_after_idle - BOOLEAN If enabled, provide RFC2861 behavior and time out the congestion window after an idle period.
An idle period is defined at diff --git a/arch/Kconfig b/arch/Kconfig index 5c54b2beb2d6..2615d8355358 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1225,7 +1225,7 @@ config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT - default ARCH_MMAP_RND_BITS_MIN + default ARCH_MMAP_RND_BITS_MAX depends on HAVE_ARCH_MMAP_RND_BITS help This value can be used to select the number of bits to use to @@ -1259,7 +1259,7 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT - default ARCH_MMAP_RND_COMPAT_BITS_MIN + default ARCH_MMAP_RND_COMPAT_BITS_MAX depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help This value can be used to select the number of bits to use to @@ -1562,6 +1562,7 @@ config RANDOMIZE_KSTACK_OFFSET config RANDOMIZE_KSTACK_OFFSET_DEFAULT bool "Default state of kernel stack offset randomization" depends on RANDOMIZE_KSTACK_OFFSET + default y help Kernel stack offset randomization is controlled by kernel boot param "randomize_kstack_offset=on/off", and this config chooses the default diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 93173f0a09c7..3f14a185ec87 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1681,6 +1681,7 @@ config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" depends on !KCSAN select ARM64_PAN + default y help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved @@ -2310,6 +2311,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" select RELOCATABLE + default y help Randomizes the virtual address at which the 
kernel image is loaded, as a security feature that deters exploit attempts diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 35e9eb180c9a..6359bc2e9471 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1,4 +1,3 @@ -CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index d2779d604c7b..ed0d158955e2 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -124,14 +124,10 @@ /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is above 4GB to leave the entire 32-bit address + * 64-bit, this is raised to 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ -#ifdef CONFIG_ARM64_FORCE_52BIT -#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) -#else -#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) -#endif /* CONFIG_ARM64_FORCE_52BIT */ +#define ELF_ET_DYN_BASE 0x100000000UL #ifndef __ASSEMBLER__ @@ -189,10 +185,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? 
\ - 0x7ff >> (PAGE_SHIFT - 12) : \ - 0x3ffff >> (PAGE_SHIFT - 12)) + ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ + ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) #else -#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) +#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) #endif #ifdef __AARCH64EB__ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e03e35a2a6ce..a664d43e6f88 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1234,8 +1234,7 @@ config VM86 default X86_LEGACY_VM86 config X86_16BIT - bool "Enable support for 16-bit segments" if EXPERT - default y + bool "Enable support for 16-bit segments" depends on MODIFY_LDT_SYSCALL help This option is required by programs like Wine to run 16-bit @@ -2224,7 +2223,7 @@ config COMPAT_VDSO choice prompt "vsyscall table for legacy applications" depends on X86_64 - default LEGACY_VSYSCALL_XONLY + default LEGACY_VSYSCALL_NONE help Legacy user code that does not know how to find the vDSO expects to be able to issue three syscalls by calling fixed addresses in @@ -2308,8 +2307,7 @@ config CMDLINE_OVERRIDE be set to 'N' under normal conditions. 
config MODIFY_LDT_SYSCALL - bool "Enable the LDT (local descriptor table)" if EXPERT - default y + bool "Enable the LDT (local descriptor table)" help Linux can allow user programs to install a per-process x86 Local Descriptor Table (LDT) using the modify_ldt(2) system diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 7d7310cdf8b0..21ebfefe9c90 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,5 +1,4 @@ CONFIG_WERROR=y -CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 6c8fdc96be7e..6d0abe9e5adc 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -229,11 +229,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is above 4GB to leave the entire 32-bit address + * 64-bit, this is raised to 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - (DEFAULT_MAP_WINDOW / 3 * 2)) + 0x100000000UL) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, @@ -315,8 +315,8 @@ extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 -#define __STACK_RND_MASK(is32bit) (0x7ff) -#define STACK_RND_MASK (0x7ff) +#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) +#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) #define ARCH_DLINFO ARCH_DLINFO_IA32 @@ -325,7 +325,11 @@ extern unsigned long get_sigframe_size(void); #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ -#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) +#ifdef CONFIG_COMPAT +#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
(1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) +#else +#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) +#endif #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 00daedfefc1b..a00d167fe5ce 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -485,6 +485,7 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) static inline void __native_tlb_flush_global(unsigned long cr4) { + BUG_ON(cr4 != __read_cr4()); native_write_cr4(cr4 ^ X86_CR4_PGE); native_write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5edafdc9680f..bfc85908f42b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -481,6 +481,7 @@ EXPORT_SYMBOL_GPL(native_write_cr4); void cr4_update_irqsoff(unsigned long set, unsigned long clear) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + BUG_ON(cr4 != __read_cr4()); lockdep_assert_irqs_disabled(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4c718f8adc59..63c96edc60a5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -706,6 +706,7 @@ void speculation_ctrl_update_current(void) static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + BUG_ON(cr4 != __read_cr4()); newval = cr4 ^ mask; if (newval != cr4) { @@ -1026,9 +1027,9 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { if (mmap_is_ia32()) - return randomize_page(mm->brk, SZ_32M); + return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - return randomize_page(mm->brk, SZ_1G); + return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a34fff6ab2b..172e2cd0053e 100644 --- 
a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -499,9 +499,9 @@ static void __init pagetable_init(void) #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) /* Bits supported by the hardware: */ -pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; +pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; /* Bits allowed in normal kernel mappings: */ -pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; +pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; EXPORT_SYMBOL_GPL(__supported_pte_mask); /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ EXPORT_SYMBOL(__default_kernel_pte_mask); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9983017ecbe0..9014d40b48ef 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -104,9 +104,9 @@ static inline pgprot_t prot_sethuge(pgprot_t prot) */ /* Bits supported by the hardware: */ -pteval_t __supported_pte_mask __read_mostly = ~0; +pteval_t __supported_pte_mask __ro_after_init = ~0; /* Bits allowed in normal kernel mappings: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; +pteval_t __default_kernel_pte_mask __ro_after_init = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ EXPORT_SYMBOL(__default_kernel_pte_mask); diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 7ee4eb94d218..ba39140b5f6b 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4914,6 +4914,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) struct ata_port *ap; struct ata_link *link; + BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ if (WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE))) return; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index d2cfc584e202..72907b30ef13 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -310,7 +310,6 @@ config NSC_GPIO config DEVMEM bool "/dev/mem 
virtual device support" - default y help Say Y here if you want to support the /dev/mem device. The /dev/mem device is used to access areas of physical @@ -343,7 +342,6 @@ config NVRAM config DEVPORT bool "/dev/port character device" depends on HAS_IOPORT - default y help Say Y here if you want to support the /dev/port device. The /dev/port device is similar to /dev/mem, but for I/O ports. diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 149f3d53b760..a3e62d52ebcf 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -116,7 +116,6 @@ config UNIX98_PTYS config LEGACY_PTYS bool "Legacy (BSD) PTY support" - default y help A pseudo terminal (PTY) is a software device consisting of two halves: a master and a slave. The slave device behaves identical to @@ -146,7 +145,6 @@ config LEGACY_PTY_COUNT config LEGACY_TIOCSTI bool "Allow legacy TIOCSTI usage" - default y help Historically the kernel has allowed TIOCSTI, which will push characters into a controlling TTY. This continues to be used diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index e2d92cf70eb7..2f509955f2ec 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty) tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); + put_user_ns(tty->owner_user_ns); kfree(tty); } @@ -2256,6 +2257,7 @@ static int tty_fasync(int fd, struct file *filp, int on) } static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI); +static int tty_tiocsti_restrict __read_mostly = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); /** * tiocsti - fake input character * @tty: tty to fake input into @@ -2277,6 +2279,12 @@ static int tiocsti(struct tty_struct *tty, u8 __user *p) if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; + if (tty_tiocsti_restrict && + !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { + dev_warn_ratelimited(tty->dev, + "Denied TIOCSTI ioctl for non-privileged process\n"); 
+ return -EPERM; + } if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) @@ -3130,6 +3138,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); + tty->owner_user_ns = get_user_ns(current_user_ns()); return tty; } @@ -3620,6 +3629,15 @@ static const struct ctl_table tty_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "tiocsti_restrict", + .data = &tty_tiocsti_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; /* diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index 60ea76160122..cb5ed42e12c2 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -15,6 +15,7 @@ usbcore-$(CONFIG_OF) += of.o usbcore-$(CONFIG_USB_XHCI_SIDEBAND) += offload.o usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o +usbcore-$(CONFIG_SYSCTL) += sysctl.o ifdef CONFIG_USB_ONBOARD_DEV usbcore-y += ../misc/onboard_usb_dev_pdevs.o diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index be50d03034a9..bde3d79738e9 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5448,6 +5448,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto done; return; } + + if (deny_new_usb) { + dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); + goto done; + } + if (hub_is_superspeed(hub->hdev)) unit_load = 150; else diff --git a/drivers/usb/core/sysctl.c b/drivers/usb/core/sysctl.c new file mode 100644 index 000000000000..813db3f0b1cb --- /dev/null +++ b/drivers/usb/core/sysctl.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include + +static struct ctl_table usb_sysctls[] = { + { + .procname = "deny_new_usb", + .data = &deny_new_usb, + .maxlen = sizeof(int), + .mode = 0644, 
+ .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static struct ctl_table_header *usb_sysctl_table; + +int usb_register_sysctl(void) +{ + usb_sysctl_table = register_sysctl("kernel", usb_sysctls); + if (!usb_sysctl_table) { + pr_warn("usb: sysctl registration failed\n"); + return -ENOMEM; + } + return 0; +} + +void usb_unregister_sysctl(void) +{ + unregister_sysctl_table(usb_sysctl_table); + usb_sysctl_table = NULL; +} diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index e740f7852bcd..671deccb636e 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -73,6 +73,9 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #define usb_autosuspend_delay 0 #endif +int deny_new_usb __read_mostly = 0; +EXPORT_SYMBOL(deny_new_usb); + static bool match_endpoint(struct usb_endpoint_descriptor *epd, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, @@ -1219,6 +1222,9 @@ static int __init usb_init(void) usb_debugfs_init(); usb_acpi_register(); + retval = usb_register_sysctl(); + if (retval) + goto sysctl_init_failed; retval = bus_register(&usb_bus_type); if (retval) goto bus_register_failed; @@ -1258,6 +1264,8 @@ static int __init usb_init(void) bus_notifier_failed: bus_unregister(&usb_bus_type); bus_register_failed: + usb_unregister_sysctl(); +sysctl_init_failed: usb_acpi_unregister(); usb_debugfs_cleanup(); out: @@ -1282,6 +1290,7 @@ static void __exit usb_exit(void) class_unregister(&usbmisc_class); bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); + usb_unregister_sysctl(); usb_acpi_unregister(); usb_debugfs_cleanup(); idr_destroy(&usb_bus_idr); diff --git a/fs/inode.c b/fs/inode.c index 379f4c19845c..2f363ae2e3e3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -167,6 +167,10 @@ late_initcall(mg_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +/* sysctl */ +int device_sidechannel_restrict __read_mostly = 1; 
+EXPORT_SYMBOL(device_sidechannel_restrict); + /* * Handle nr_inode sysctl */ @@ -199,6 +203,15 @@ static const struct ctl_table inodes_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_inodes, }, + { + .procname = "device_sidechannel_restrict", + .data = &device_sidechannel_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_inode_sysctls(void) diff --git a/fs/namei.c b/fs/namei.c index cf16b6822dd3..239c2933faad 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1143,10 +1143,10 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -static int sysctl_protected_symlinks __read_mostly; -static int sysctl_protected_hardlinks __read_mostly; -static int sysctl_protected_fifos __read_mostly; -static int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly = 1; +static int sysctl_protected_hardlinks __read_mostly = 1; +static int sysctl_protected_fifos __read_mostly = 2; +static int sysctl_protected_regular __read_mostly = 2; #ifdef CONFIG_SYSCTL static const struct ctl_table namei_sysctls[] = { diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 07932ce9246c..c6754a2a6448 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -197,7 +197,6 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - default y config NFS_DISABLE_UDP_SUPPORT bool "NFS: Disable NFS UDP protocol support" diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 2ac67e04a6fb..3340e13c959c 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -134,3 +134,19 @@ config OVERLAY_FS_DEBUG Say Y here to enable extra debugging checks in overlayfs. If unsure, say N. 
+ +config OVERLAY_FS_UNPRIVILEGED + bool "Overlayfs: turn on unprivileged user namespace mounts" + default n + depends on OVERLAY_FS + help + When disabled, unprivileged users will not be able to create + new overlayfs mounts. This reduces the attack surface when + unprivileged user namespace mounts (needed, for example, by + rootless containers) are not required. + + Overlayfs has been part of several recent local privilege + escalation exploits, so if you are security-conscious + you should leave this disabled. + + If unsure, say N. diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ba9146f22a2c..504ec95f88bd 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1580,7 +1580,9 @@ struct file_system_type ovl_fs_type = { .name = "overlay", .init_fs_context = ovl_init_fs_context, .parameters = ovl_parameter_spec, +#ifdef CONFIG_OVERLAY_FS_UNPRIVILEGED .fs_flags = FS_USERNS_MOUNT, +#endif .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("overlay"); diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 6ae966c561e7..27d78d669f95 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -41,7 +41,6 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support" depends on PROC_FS && CRASH_DUMP - default y help Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 49ab74e0bfde..4be54d32a60a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1154,6 +1154,7 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header) (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || (entry->proc_handler == proc_dointvec_minmax) || + (entry->proc_handler == proc_dointvec_minmax_sysadmin) || (entry->proc_handler == proc_dou8vec_minmax) || (entry->proc_handler == proc_dointvec_jiffies) || (entry->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/fs/stat.c b/fs/stat.c index 6c79661e1b96..d02fadfdd34b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -52,7 +52,10 @@ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) return; } - stat->mtime = inode_get_mtime(inode); + if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) + stat->mtime = inode_get_ctime(inode); + else + stat->mtime = inode_get_mtime(inode); stat->ctime.tv_sec = inode->i_ctime_sec; stat->ctime.tv_nsec = (u32)atomic_read(pcn); if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) @@ -84,6 +87,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + bool sidechannel_device = false; stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; @@ -93,13 +97,22 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); - stat->atime = inode_get_atime(inode); + + if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) + sidechannel_device = true; + if (sidechannel_device) + stat->atime = inode_get_ctime(inode); + else + stat->atime = inode_get_atime(inode); if (is_mgtime(inode)) { fill_mg_cmtime(stat, request_mask, inode); } else { stat->ctime = inode_get_ctime(inode); - stat->mtime 
= inode_get_mtime(inode); + if (sidechannel_device) + stat->mtime = inode_get_ctime(inode); + else + stat->mtime = inode_get_mtime(inode); } stat->blksize = i_blocksize(inode); @@ -212,6 +225,10 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, ret = inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); + if (!ret && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { + stat->atime = stat->ctime; + stat->mtime = stat->ctime; + } if (ret) return ret; } else { diff --git a/include/linux/cache.h b/include/linux/cache.h index e69768f50d53..432c30a1fc7e 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -60,6 +60,8 @@ #define __ro_after_init __section(".data..ro_after_init") #endif +#define __read_only __ro_after_init + #ifndef ____cacheline_aligned_in_smp #ifdef CONFIG_SMP #define ____cacheline_aligned_in_smp ____cacheline_aligned diff --git a/include/linux/capability.h b/include/linux/capability.h index 37db92b3d6f8..873416ba884c 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -145,6 +145,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); +extern bool capable_noaudit(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); @@ -167,6 +168,10 @@ static inline bool capable(int cap) { return true; } +static inline bool capable_noaudit(int cap) +{ + return true; +} static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; diff --git a/include/linux/fs.h b/include/linux/fs.h index a01621fa636a..0dc2589afdf0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3610,4 +3610,15 @@ static inline bool extensible_ioctl_valid(unsigned int cmd_a, return true; } +extern int 
device_sidechannel_restrict; + +static inline bool is_sidechannel_device(const struct inode *inode) +{ + umode_t mode; + if (!device_sidechannel_restrict) + return false; + mode = inode->i_mode; + return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 28a9cb13fbfa..e16cbb76bc48 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -124,6 +124,9 @@ static inline int fsnotify_file(struct file *file, __u32 mask) if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; + if (mask & (FS_ACCESS | FS_MODIFY) && is_sidechannel_device(file_inode(file))) + return 0; + return fsnotify_path(&file->f_path, mask); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index abc20f9810fd..548a9d608d58 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -259,6 +259,13 @@ static inline bool tag_clear_highpages(struct page *page, int numpages) #endif +static inline void verify_zero_highpage(struct page *page) +{ + void *kaddr = kmap_atomic(page); + BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); + kunmap_atomic(kaddr); +} + /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b2bb878abd11..a773b99e58c4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -614,7 +614,7 @@ static inline void do_softirq_post_smp_call_flush(unsigned int unused) } #endif -extern void open_softirq(int nr, void (*action)(void)); +extern void __init open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 150fe2ae1b6b..67d22bb918c2 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -45,7 +45,7 @@ struct kobj_ns_type_operations { void (*drop_ns)(void *); }; -int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); +int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); int kobj_ns_type_registered(enum kobj_ns_type type); const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent); const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fbe94b65b30a..902c6168d1d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1776,6 +1776,14 @@ static inline int perf_is_paranoid(void) extern int perf_allow_kernel(void); +static inline int perf_allow_open(void) +{ + if (sysctl_perf_event_paranoid > 2 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(PERF_SECURITY_OPEN); +} + static inline int perf_allow_cpu(void) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 288fe0055cd5..38aac4de403c 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -185,6 +185,8 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); int 
proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, diff --git a/include/linux/tty.h b/include/linux/tty.h index 0a46e4054dec..99c733852fb2 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -14,6 +14,7 @@ #include #include #include +#include /* @@ -240,6 +241,7 @@ struct tty_struct { struct list_head tty_files; struct work_struct SAK_work; + struct user_namespace *owner_user_ns; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 32e17626dfdc..e1a34906a0ec 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2104,6 +2104,17 @@ extern void usb_led_activity(enum usb_led_event ev); static inline void usb_led_activity(enum usb_led_event ev) {} #endif +/* sysctl.c */ +extern int deny_new_usb; +#ifdef CONFIG_SYSCTL +extern int usb_register_sysctl(void); +extern void usb_unregister_sysctl(void); +#else +static inline int usb_register_sysctl(void) { return 0; } +static inline void usb_unregister_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9c3be157397e..bb05d4a07c46 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -173,6 +173,8 @@ static inline struct user_namespace *to_user_ns(struct ns_common *ns) #ifdef CONFIG_USER_NS +extern int unprivileged_userns_clone; + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) @@ -206,6 +208,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common 
*ns_get_owner(struct ns_common *ns); #else +#define unprivileged_userns_clone 0 + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; diff --git a/include/net/tcp.h b/include/net/tcp.h index d960a53ee1ca..1f6ad9e8ddb3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -287,6 +287,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; +extern int sysctl_tcp_simult_connect; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ diff --git a/init/Kconfig b/init/Kconfig index 6c8ba1a1a8b9..6d2d765d4a56 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -519,6 +519,7 @@ config CROSS_MEMORY_ATTACH config AUDIT bool "Auditing support" depends on NET + default y help Enable auditing infrastructure that can be used with another kernel subsystem, such as SELinux (which requires this for @@ -1404,8 +1405,8 @@ config USER_NS config USER_NS_UNPRIVILEGED bool "Allow unprivileged users to create namespaces" - default y depends on USER_NS + default n help When disabled, unprivileged users will not be able to create new namespaces. Allowing users to create their own namespaces @@ -1416,7 +1417,7 @@ config USER_NS_UNPRIVILEGED This setting can be overridden at runtime via the kernel.unprivileged_userns_clone sysctl. - If unsure, say Y. + If unsure, say N. config PID_NS bool "PID Namespaces" @@ -1690,9 +1691,8 @@ menuconfig EXPERT Only use this if you really know what you are doing. config UID16 - bool "Enable 16-bit UID system calls" if EXPERT + bool "Enable 16-bit UID system calls" depends on HAVE_UID16 && MULTIUSER - default y help This enables the legacy 16-bit UID syscall wrappers. @@ -1878,8 +1878,7 @@ config SHMEM which may be appropriate on small systems without swap. 
config AIO - bool "Enable AIO support" if EXPERT - default y + bool "Enable AIO support" help This option enables POSIX asynchronous I/O which may by used by some high performance threaded applications. Disabling diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ac1a5cf10287..f609cb02a9a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -143,7 +143,7 @@ static __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(io_key_has_sqarray, HZ); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; -static int __read_mostly sysctl_io_uring_disabled; +static int __read_mostly sysctl_io_uring_disabled = 1; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL @@ -153,8 +153,9 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { .data = &sysctl_io_uring_disabled, .maxlen = sizeof(sysctl_io_uring_disabled), .mode = 0644, + /* only handle a transition from default "1" to "2" */ .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_TWO, .extra2 = SYSCTL_TWO, }, { diff --git a/kernel/audit.c b/kernel/audit.c index 26a332ffb1b8..c30adeb1650b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1779,6 +1779,9 @@ static int __init audit_enable(char *str) if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; + else if (!audit_ever_enabled) + audit_initialized = AUDIT_UNINITIALIZED; + if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 85c0feaae0d3..277a1b7b3fae 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -543,7 +543,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) /* All BPF JIT sysctl knobs here. 
*/ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); -int bpf_jit_harden __read_mostly; +int bpf_jit_harden __read_mostly = 2; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; diff --git a/kernel/capability.c b/kernel/capability.c index 829f49ae07b9..5bb7ee4028ad 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -416,6 +416,12 @@ bool capable(int cap) return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); + +bool capable_noaudit(int cap) +{ + return ns_capable_noaudit(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable_noaudit); #endif /* CONFIG_MULTIUSER */ /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 84a79e977580..c63139406a70 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -464,8 +464,13 @@ static struct kmem_cache *perf_event_cache; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 2; +#endif /* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); @@ -13549,7 +13554,7 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? 
*/ - err = security_perf_event_open(PERF_SECURITY_OPEN); + err = perf_allow_open(); if (err) return err; diff --git a/kernel/fork.c b/kernel/fork.c index ecf33c82df7f..b59bea81742b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -123,16 +124,6 @@ #include -#ifdef CONFIG_USER_NS -# ifdef CONFIG_USER_NS_UNPRIVILEGED -static int unprivileged_userns_clone = 1; -# else -static int unprivileged_userns_clone = 0; -# endif -#else -#define unprivileged_userns_clone 1 -#endif - /* * Minimum number of threads to boot the kernel */ @@ -1996,6 +1987,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -2040,11 +2035,6 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } - /* * Force any signals received before this point to be delivered * before the fork happens. 
Collect up signals sent to multiple @@ -3081,10 +3071,6 @@ static int check_unshare_flags(unsigned long unshare_flags) if (!current_is_single_threaded()) return -EINVAL; } - if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - } return 0; } @@ -3167,6 +3153,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { + err = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto bad_unshare_out; + } + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; @@ -3315,15 +3307,6 @@ static const struct ctl_table fork_sysctl_table[] = { .mode = 0644, .proc_handler = sysctl_max_threads, }, -#ifdef CONFIG_USER_NS - { - .procname = "unprivileged_userns_clone", - .data = &unprivileged_userns_clone, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif }; static int __init init_fork_sysctl(void) diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index da77f3f5c1fe..1418fc33ed1f 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -11,15 +11,6 @@ static const int ten_thousand = 10000; -static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - static const struct ctl_table printk_sysctls[] = { { .procname = "printk", diff --git a/kernel/softirq.c b/kernel/softirq.c index 77198911b8dd..1c508ce336ae 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); 
DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -790,7 +790,7 @@ void __raise_softirq_irqoff(unsigned int nr) or_softirq_pending(1UL << nr); } -void open_softirq(int nr, void (*action)(void)) +void __init open_softirq(int nr, void (*action)(void)) { softirq_vec[nr].action = action; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2cd767b9680e..e216f47671f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -22,6 +22,10 @@ #include #include +#ifdef CONFIG_USER_NS +#include +#endif + /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); @@ -656,6 +660,35 @@ int proc_douintvec(const struct ctl_table *table, int dir, void *buffer, do_proc_uint_conv); } +/** + * proc_dointvec_minmax_sysadmin - read a vector of integers with min/max values + * checking CAP_SYS_ADMIN on write + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Writing is only allowed when the current task has CAP_SYS_ADMIN. + * + * Returns 0 on success, -EPERM on permission failure or -EINVAL on write + * when the range check fails. 
+ */ +int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (dir && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, dir, buffer, lenp, ppos); +} + /** * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table @@ -1067,6 +1100,12 @@ int proc_doulongvec_minmax(const struct ctl_table *table, int dir, return -ENOSYS; } +int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) @@ -1122,6 +1161,17 @@ int proc_do_static_key(const struct ctl_table *table, int dir, } static const struct ctl_table sysctl_subsys_table[] = { +#ifdef CONFIG_USER_NS + { + .procname = "unprivileged_userns_clone", + .data = &unprivileged_userns_clone, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "sysctl_writes_strict", @@ -1183,6 +1233,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL(proc_dointvec_minmax_sysadmin); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 03cb63883d04..597c33a2ea16 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,13 @@ #include #include +/* sysctl */ +#ifdef CONFIG_USER_NS_UNPRIVILEGED +int unprivileged_userns_clone = 1; +#else +int unprivileged_userns_clone; +#endif + static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); diff --git 
a/lib/Kconfig.debug b/lib/Kconfig.debug index 52c7a3a89f08..1cb2e7286072 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -519,6 +519,9 @@ config SECTION_MISMATCH_WARN_ONLY If unsure, say Y. +config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE + bool "Enable verbose reporting of writable function pointers" + config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC || RISCV || S390) @@ -683,7 +686,7 @@ config DEBUG_FS choice prompt "Debugfs default access" depends on DEBUG_FS - default DEBUG_FS_ALLOW_ALL + default DEBUG_FS_ALLOW_NONE help This selects the default access restrictions for debugfs. It can be overridden with kernel command line option @@ -1058,6 +1061,7 @@ menu "Debug Oops, Lockups and Hangs" config PANIC_ON_OOPS bool "Panic on Oops" + default y help Say Y here to enable the kernel to panic when it oopses. This has the same effect as setting oops=panic on the kernel command @@ -1067,7 +1071,7 @@ config PANIC_ON_OOPS anything erroneous after an oops which could result in data corruption or other issues. - Say N if unsure. + Say Y if unsure. config PANIC_TIMEOUT int "panic timeout" @@ -1895,6 +1899,7 @@ config STRICT_DEVMEM config IO_STRICT_DEVMEM bool "Filter I/O access to /dev/mem" depends on STRICT_DEVMEM + default y help If this option is disabled, you allow userspace (root) access to all io-memory regardless of whether a driver is actively using that diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 6fbbebec683a..e494618f7193 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -96,4 +96,13 @@ config KFENCE_KUNIT_TEST during boot; say M if you want the test to build as a module; say N if you are unsure. 
+config KFENCE_BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + default y + help + Select this option if the kernel should BUG when kfence encounters + data corruption of kfence managed objects after error report. + + If unsure, say Y. + endif # KFENCE diff --git a/lib/kobject.c b/lib/kobject.c index abe5f5b856ce..a80fa9d8f2db 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -1019,9 +1019,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); static DEFINE_SPINLOCK(kobj_ns_type_lock); -static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; +static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; -int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) +int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) { enum kobj_ns_type type = ops->type; int error; diff --git a/lib/nlattr.c b/lib/nlattr.c index be9c576b6e2d..484d839bcf5e 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -837,6 +837,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); + BUG_ON(minlen < 0); + memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index a3790c43a0ab..ac945a6801da 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -855,7 +855,7 @@ static char *default_pointer(char *buf, char *end, const void *ptr, return ptr_to_id(buf, end, ptr, spec); } -int kptr_restrict __read_mostly; +int kptr_restrict __read_mostly = 2; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, diff --git a/mm/Kconfig b/mm/Kconfig index df5f168a3f30..fc42b3d1ffa6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -188,7 +188,6 @@ config SLUB_TINY config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" - default y help For reduced kernel memory fragmentation, slab caches can be merged when they share the same size 
and other characteristics. @@ -203,6 +202,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" depends on !SLUB_TINY + default y help Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab @@ -211,6 +211,7 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" depends on !SLUB_TINY + default y help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance @@ -234,6 +235,23 @@ config SLAB_BUCKETS If unsure, say Y. +config SLAB_CANARY + depends on SLUB + depends on !SLAB_MERGE_DEFAULT + bool "SLAB canaries" + default y + help + Place canaries at the end of kernel slab allocations, sacrificing + some performance and memory usage for security. + + Canaries can detect some forms of heap corruption when allocations + are freed and as part of the HARDENED_USERCOPY feature. It provides + basic use-after-free detection for HARDENED_USERCOPY. + + Canaries absorb small overflows (rendering them harmless), mitigate + non-NUL terminated C string overflows on 64-bit via a guaranteed zero + byte and provide basic double-free detection. + config SLUB_STATS default n bool "Enable performance statistics" @@ -259,7 +277,7 @@ config SLUB_CPU_PARTIAL Typically one would choose no for a realtime system. config RANDOM_KMALLOC_CACHES - default n + default y depends on !SLUB_TINY bool "Randomize slab caches for normal kmalloc" help @@ -302,7 +320,6 @@ config SHUFFLE_PAGE_ALLOCATOR config COMPAT_BRK bool "Disable heap randomization" - default y help Randomizing heap placement makes heap exploits harder, but it also breaks ancient binaries (including anything libc5 based). 
@@ -780,7 +797,8 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" depends on MMU - default 4096 + default 32768 if ARM || (ARM64 && COMPAT) + default 65536 help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 7638d75b27db..08ae2acd7b1b 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -47,7 +47,7 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config SLUB_DEBUG default y - bool "Enable SLUB debugging support" if EXPERT + bool "Enable SLUB debugging support" depends on SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help @@ -189,6 +189,7 @@ config DEBUG_WX depends on ARCH_HAS_PTDUMP depends on MMU select PTDUMP + default y help Generate a warning if any W+X mappings are found at boot. diff --git a/mm/internal.h b/mm/internal.h index f35dbcf99a86..3fabad7b315c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -781,6 +781,9 @@ static inline struct folio *page_rmappable_folio(struct page *page) return folio; } +extern void __init __gather_extra_latent_entropy(struct page *page, + unsigned int nr_pages); + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 10e6802a2edf..01f8700fafc6 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -278,6 +279,10 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r lockdep_on(); +#ifdef CONFIG_KFENCE_BUG_ON_DATA_CORRUPTION + BUG(); +#endif + check_panic_on_warn("KFENCE"); /* We encountered a memory safety error, taint the kernel! 
*/ diff --git a/mm/mm_init.c b/mm/mm_init.c index 68b3a05a0357..ad45f2a232d8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2001,6 +2001,7 @@ static void __init deferred_free_pages(unsigned long pfn, for (i = 0; i < nr_pages; i += pageblock_nr_pages) init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, false); + __gather_extra_latent_entropy(page, 1 << MAX_PAGE_ORDER); __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } @@ -2012,6 +2013,7 @@ static void __init deferred_free_pages(unsigned long pfn, if (pageblock_aligned(pfn)) init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); + __gather_extra_latent_entropy(page, 1); __free_pages_core(page, 0, MEMINIT_EARLY); } } @@ -2497,6 +2499,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* pages were reserved and not allocated */ clear_page_tag_ref(page); + __gather_extra_latent_entropy(page, 1 << order); __free_pages_core(page, order, MEMINIT_EARLY); } diff --git a/mm/mmap.c b/mm/mmap.c index 4bdb9ffa9e25..50dbefad21a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -152,6 +152,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); + /* properly handle unaligned min_brk as an empty heap */ + if (min_brk & ~PAGE_MASK) { + if (brk == min_brk) + newbrk -= PAGE_SIZE; + if (mm->brk == min_brk) + oldbrk -= PAGE_SIZE; + } if (oldbrk == newbrk) { mm->brk = brk; goto success; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 312a0a77ecda..b381e70930da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -214,6 +214,15 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); static DEFINE_MUTEX(pcpu_drain_mutex); +bool __meminitdata extra_latent_entropy; + +static int __init setup_extra_latent_entropy(char *str) +{ + extra_latent_entropy = true; + return 0; +} +early_param("extra_latent_entropy", setup_extra_latent_entropy); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ 
-1620,6 +1629,25 @@ static void __free_pages_ok(struct page *page, unsigned int order, free_one_page(zone, page, pfn, order, fpi_flags); } +void __init __gather_extra_latent_entropy(struct page *page, + unsigned int nr_pages) +{ + if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { + unsigned long hash = 0; + size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; + const unsigned long *data = lowmem_page_address(page); + + for (index = 0; index < end; index++) + hash ^= hash + data[index]; +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + latent_entropy ^= hash; + add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); +#else + add_device_randomness((const void *)&hash, sizeof(hash)); +#endif + } +} + void __meminit __free_pages_core(struct page *page, unsigned int order, enum meminit_context context) { @@ -1861,6 +1889,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ kernel_unpoison_pages(page, 1 << order); + if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { + int i; + for (i = 0; i < (1 << order); i++) + verify_zero_highpage(page + i); + } + /* * As memory initialization might be integrated into KASAN, * KASAN unpoisoning and memory initializion code must be diff --git a/mm/slab.h b/mm/slab.h index e767aa7e91b0..da36513b93cd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -254,6 +254,12 @@ struct kmem_cache { unsigned long random; #endif +#ifdef CONFIG_SLAB_CANARY + unsigned long random_active; + unsigned long random_inactive; + unsigned long sheaf_random_active; +#endif + #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. @@ -574,7 +580,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. 
*/ - if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) return s->inuse; /* * Else we can use all the padding etc for the allocation @@ -619,8 +625,10 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) { +#ifndef CONFIG_SLUB if (c->ctor) return false; +#endif if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) return flags & __GFP_ZERO; return true; @@ -631,9 +639,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) static inline bool slab_want_init_on_free(struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free)) - return !(c->ctor || - (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + &init_on_free)) { +#ifndef CONFIG_SLUB + if (c->ctor) + return false; +#endif + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return false; + return true; + } return false; } diff --git a/mm/slab_common.c b/mm/slab_common.c index ee994ec7f251..8c1f9a124d4b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,10 +37,10 @@ #define CREATE_TRACE_POINTS #include -enum slab_state slab_state; +enum slab_state slab_state __ro_after_init; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); -struct kmem_cache *kmem_cache; +struct kmem_cache *kmem_cache __ro_after_init; /* * Set of flags that will prevent slab merging @@ -55,7 +55,7 @@ struct kmem_cache *kmem_cache; /* * Merge control. If this is set then no merging of slab caches will occur. 
*/ -static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); +static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { diff --git a/mm/slub.c b/mm/slub.c index 92f891816bb8..6200d6c14823 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -254,6 +255,12 @@ static inline bool kmem_cache_debug(struct kmem_cache *s) return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } +static inline bool has_sanitize_verify(struct kmem_cache *s) +{ + return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && + slab_want_init_on_free(s); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -866,6 +873,8 @@ static inline void set_orig_size(struct kmem_cache *s, return; p += get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + p = (void *)p + sizeof(void *); p += sizeof(struct track) * 2; *(unsigned long *)p = orig_size; @@ -882,11 +891,52 @@ static inline unsigned long get_orig_size(struct kmem_cache *s, void *object) return s->object_size; p += get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + p = (void *)p + sizeof(void *); p += sizeof(struct track) * 2; return *(unsigned long *)p; } +#ifdef CONFIG_SLAB_CANARY +static inline unsigned long *get_canary(struct kmem_cache *s, void *object) +{ + return object + get_info_end(s); +} + +static inline unsigned long get_canary_value(const void *canary, unsigned long value) +{ + return (value ^ (unsigned long)canary) & CANARY_MASK; +} + +static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) +{ + unsigned long *canary = get_canary(s, object); + *canary = get_canary_value(canary, value); +} + +static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) +{ + unsigned long *canary = get_canary(s, object); + BUG_ON(*canary != get_canary_value(canary, value)); +} + +static 
inline void check_set_canary_bulk(struct kmem_cache *s, unsigned int size, void **objects, unsigned long check_value, unsigned long set_value) +{ + for (int i = 0; i < size; i++) { + if (!is_kfence_address(objects[i])) { + check_canary(s, objects[i], check_value); + set_canary(s, objects[i], set_value); + } + } +} + +#else +#define set_canary(s, object, value) +#define check_canary(s, object, value) +#define check_set_canary_bulk(s, size, objects, check_value, set_value) +#endif + #ifdef CONFIG_SLUB_DEBUG /* @@ -968,13 +1018,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; #else -static slab_flags_t slub_debug; +static slab_flags_t slub_debug __ro_after_init; #endif static const char *slub_debug_string __ro_after_init; -static int disable_higher_order_debug; +static int disable_higher_order_debug __ro_after_init; /* * slub is about to manipulate internal object metadata. This memory lies @@ -1034,6 +1084,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, p = object + get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + p = (void *)p + sizeof(void *); + return kasan_reset_tag(p + alloc); } @@ -1195,6 +1248,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) off = get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + off += sizeof(void *); + if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -1370,9 +1426,10 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, * Meta data starts here. * * A. Free pointer (if we cannot overwrite object on free) - * B. Tracking data for SLAB_STORE_USER - * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) - * D. Padding to reach required alignment boundary or at minimum + * B. Canary for SLAB_CANARY + * C. 
Tracking data for SLAB_STORE_USER + * D. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * E. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -1390,6 +1447,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + off += sizeof(void *); + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); @@ -2497,11 +2557,19 @@ struct rcu_delayed_free { */ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init, - bool after_rcu_delay) + bool after_rcu_delay, bool canary) { /* Are the object contents still accessible? */ bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + /* + * Postpone setting the inactive canary until the metadata + * has potentially been cleared at the end of this function. + */ + if (canary) { + check_canary(s, x, s->random_active); + } + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2567,15 +2635,28 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, if (!kasan_has_integrated_init()) memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + +#ifdef CONFIG_SLAB_CANARY + memset((char *)kasan_reset_tag(x) + inuse + sizeof(void *), 0, + s->size - inuse - sizeof(void *) - rsize); +#else memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); +#endif /* * Restore orig_size, otherwise kmalloc redzone overwritten * would be reported */ set_orig_size(s, x, orig_size); + if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) + s->ctor(x); } + + if (canary) { + set_canary(s, x, s->random_inactive); + } + /* KASAN might put x into memory quarantine, delaying its reuse. 
*/ return !kasan_slab_free(s, x, init, still_accessible, false); } @@ -2591,7 +2672,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false, false); + slab_free_hook(s, next, false, false, false); return false; } @@ -2606,7 +2687,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init, false))) { + if (likely(slab_free_hook(s, object, init, false, true))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2618,6 +2699,22 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, * accordingly if object's reuse is delayed. */ --(*cnt); + + /* Objects that are put into quarantine by KASAN will + * still undergo free_consistency_checks(), which + * checks whether the freelist pointer is valid if it + * is located after the object (see check_object()). + * Since this is the case for slab caches with + * constructors, we need to fix the freelist pointer + * after init_on_free has overwritten it. + * + * Note that doing this for all caches (not just ctor + * ones) would cause a GPF due to KASAN poisoning and + * the way set_freepointer() eventually dereferences + * the freepointer. 
+ */ + if (slab_want_init_on_free(s) && s->ctor) + set_freepointer(s, object, NULL); } } while (object != old_tail); @@ -2627,8 +2724,9 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, static void *setup_object(struct kmem_cache *s, void *object) { setup_object_debug(s, object); + set_canary(s, object, s->random_inactive); object = kasan_init_slab_obj(s, object); - if (unlikely(s->ctor)) { + if (unlikely(s->ctor) && !has_sanitize_verify(s)) { kasan_unpoison_new_object(s, object); s->ctor(object); kasan_poison_new_object(s, object); @@ -2756,6 +2854,7 @@ static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s) local_unlock(&s->cpu_sheaves->lock); + check_set_canary_bulk(s, batch, &objects[0], s->random_active, s->random_inactive); __kmem_cache_free_bulk(s, batch, &objects[0]); stat_add(s, SHEAF_FLUSH, batch); @@ -2801,20 +2900,24 @@ static bool sheaf_try_flush_main(struct kmem_cache *s) * necessary when flushing cpu's sheaves (both spare and main) during cpu * hotremove as the cpu is not executing anymore. 
*/ -static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf, bool canary) { if (!sheaf->size) return; stat_add(s, SHEAF_FLUSH, sheaf->size); + if (canary) { + check_set_canary_bulk(s, sheaf->size, &sheaf->objects[0], s->random_active, s->random_inactive); + } __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); sheaf->size = 0; } static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, - struct slab_sheaf *sheaf) + struct slab_sheaf *sheaf, + bool canary) { bool init = slab_want_init_on_free(s); void **p = &sheaf->objects[0]; @@ -2827,7 +2930,7 @@ static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, memcg_slab_free_hook(s, slab, p + i, 1); alloc_tagging_slab_free_hook(s, slab, p + i, 1); - if (unlikely(!slab_free_hook(s, p[i], init, true))) { + if (unlikely(!slab_free_hook(s, p[i], init, true, canary))) { p[i] = p[--sheaf->size]; continue; } @@ -2849,9 +2952,9 @@ static void rcu_free_sheaf_nobarn(struct rcu_head *head) sheaf = container_of(head, struct slab_sheaf, rcu_head); s = sheaf->cache; - __rcu_free_sheaf_prepare(s, sheaf); + __rcu_free_sheaf_prepare(s, sheaf, true); - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, false); free_empty_sheaf(s, sheaf); } @@ -2882,7 +2985,7 @@ static void pcs_flush_all(struct kmem_cache *s) local_unlock(&s->cpu_sheaves->lock); if (spare) { - sheaf_flush_unused(s, spare); + sheaf_flush_unused(s, spare, true); free_empty_sheaf(s, spare); } @@ -2899,9 +3002,9 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) pcs = per_cpu_ptr(s->cpu_sheaves, cpu); /* The cpu is not executing anymore so we don't need pcs->lock */ - sheaf_flush_unused(s, pcs->main); + sheaf_flush_unused(s, pcs->main, true); if (pcs->spare) { - sheaf_flush_unused(s, pcs->spare); + sheaf_flush_unused(s, pcs->spare, true); free_empty_sheaf(s, pcs->spare); pcs->spare = NULL; } @@ -3116,7 +3219,7 @@ static void 
barn_shrink(struct kmem_cache *s, struct node_barn *barn) spin_unlock_irqrestore(&barn->lock, flags); list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, true); free_empty_sheaf(s, sheaf); } @@ -5046,6 +5149,8 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); + if (p[i] && init && s->ctor) + s->ctor(p[i]); if (gfpflags_allow_spinning(flags)) kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); @@ -5231,6 +5336,11 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) pcs->main->size--; + if (!is_kfence_address(object)) { + check_canary(s, object, s->random_active); + set_canary(s, object, s->sheaf_random_active); + } + local_unlock(&s->cpu_sheaves->lock); stat(s, ALLOC_PCS); @@ -5296,6 +5406,8 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) main->size -= batch; memcpy(p, main->objects + main->size, batch * sizeof(void *)); + check_set_canary_bulk(s, batch, p, s->random_active, s->sheaf_random_active); + local_unlock(&s->cpu_sheaves->lock); stat_add(s, ALLOC_PCS, batch); @@ -5327,6 +5439,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list { void *object; bool init = false; + bool from_pcs = false; + bool from_pcs_failed = false; s = slab_pre_alloc_hook(s, gfpflags); if (unlikely(!s)) @@ -5336,14 +5450,45 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves) + if (s->cpu_sheaves) { object = alloc_from_pcs(s, gfpflags, node); + from_pcs = true; + } - if (!object) + if (!object) { object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (from_pcs) + from_pcs_failed = true; + } maybe_wipe_obj_freeptr(s, object); - init = slab_want_init_on_alloc(gfpflags, s); + + if (has_sanitize_verify(s) 
&& object) { + /* KASAN hasn't unpoisoned the object yet (this is done in the + * post-alloc hook), so let's do it temporarily. + */ + kasan_unpoison_new_object(s, object); + BUG_ON(memchr_inv(object, 0, s->object_size)); + if (s->ctor) + s->ctor(object); + kasan_poison_new_object(s, object); + } else { + init = slab_want_init_on_alloc(gfpflags, s); + } + + /* + * linux-hardened: In the scenario where an object is intended to be allocated + * from a sheaf but its allocation failed, it is instead directly allocated from the + * slab allocator but will later be freed back to a sheaf. We thus need to + * set the canary to sheaf_random_active. + */ + if (object && !from_pcs) { + check_canary(s, object, s->random_inactive); + set_canary(s, object, s->random_active); + } else if (object && from_pcs_failed) { + check_canary(s, object, s->random_inactive); + set_canary(s, object, s->sheaf_random_active); + } out: /* @@ -5517,7 +5662,7 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) if (sheaf->size < size && __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, true); free_empty_sheaf(s, sheaf); sheaf = NULL; } @@ -5544,7 +5689,7 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, if (unlikely((sheaf->capacity != s->sheaf_capacity) || sheaf->pfmemalloc)) { - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, true); kfree(sheaf); return; } @@ -5572,7 +5717,7 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, */ if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES || refill_sheaf(s, sheaf, gfp)) { - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, true); free_empty_sheaf(s, sheaf); return; } @@ -5665,6 +5810,11 @@ kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, /* add __GFP_NOFAIL to force successful memcg charging */ slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); + + if
(!is_kfence_address(ret)) { + check_canary(s, ret, s->random_active); + set_canary(s, ret, s->sheaf_random_active); + } out: trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); @@ -6219,7 +6369,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) pcs->spare = NULL; local_unlock(&s->cpu_sheaves->lock); - sheaf_flush_unused(s, to_flush); + sheaf_flush_unused(s, to_flush, true); empty = to_flush; goto got_empty; } @@ -6293,6 +6443,10 @@ bool free_to_pcs(struct kmem_cache *s, void *object) return false; } + if (!is_kfence_address(object)) { + check_canary(s, object, s->sheaf_random_active); + set_canary(s, object, s->random_active); + } pcs->main->objects[pcs->main->size++] = object; local_unlock(&s->cpu_sheaves->lock); @@ -6324,7 +6478,7 @@ static void rcu_free_sheaf(struct rcu_head *head) * If it returns true, there was at least one object from pfmemalloc * slab so simply flush everything. */ - if (__rcu_free_sheaf_prepare(s, sheaf)) + if (__rcu_free_sheaf_prepare(s, sheaf, false)) goto flush; n = get_node(s, sheaf->node); @@ -6351,7 +6505,7 @@ static void rcu_free_sheaf(struct rcu_head *head) flush: stat(s, BARN_PUT_FAIL); - sheaf_flush_unused(s, sheaf); + sheaf_flush_unused(s, sheaf, true); empty: if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { @@ -6442,6 +6596,11 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) * Since we flush immediately when size reaches capacity, we never reach * this with size already at capacity, so no OOB write is possible. 
*/ + + if (!is_kfence_address(obj)) { + check_canary(s, obj, s->sheaf_random_active); + set_canary(s, obj, s->random_active); + } rcu_sheaf->objects[rcu_sheaf->size++] = obj; if (likely(rcu_sheaf->size < s->sheaf_capacity)) { @@ -6493,11 +6652,16 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) memcg_slab_free_hook(s, slab, p + i, 1); alloc_tagging_slab_free_hook(s, slab, p + i, 1); - if (unlikely(!slab_free_hook(s, p[i], init, false))) { + if (unlikely(!slab_free_hook(s, p[i], init, false, false))) { p[i] = p[--size]; continue; } + if (!is_kfence_address(p[i])) { + check_canary(s, p[i], s->sheaf_random_active); + set_canary(s, p[i], s->random_active); + } + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) || slab_test_pfmemalloc(slab))) { remote_objects[remote_nr] = p[i]; @@ -6580,10 +6744,12 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) * many full sheaves, free the rest to slab pages */ fallback: + check_set_canary_bulk(s, size, p, s->random_active, s->random_inactive); __kmem_cache_free_bulk(s, size, p); flush_remote: if (remote_nr) { + check_set_canary_bulk(s, remote_nr, &remote_objects[0], s->random_active, s->random_inactive); __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); if (i < size) { remote_nr = 0; @@ -6785,19 +6951,43 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { + bool canary = true; + bool to_sheaf = false; + memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + /* Make sure canaries are not used on kfence objects. */ + if (is_kfence_address(object)) + canary = false; + + /* Defer canary checking if the object is freed back to pcs. 
*/ + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) { + canary = false; + } + + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false, canary))) return; if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) && likely(!slab_test_pfmemalloc(slab))) { + to_sheaf = true; if (likely(free_to_pcs(s, object))) return; } + /* + * linux-hardened: In this scenario, the object was intended to be freed to a + * sheaf but it failed. The object will thus be freed back to the slab allocator, + * the canary thus needs to be checked as a sheaf one and set back to a slab inactive one. + */ + if (to_sheaf && canary) { + check_canary(s, object, s->sheaf_random_active); + set_canary(s, object, s->random_inactive); + } + do_slab_free(s, slab, object, object, 1, addr); } @@ -6806,11 +6996,16 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { + bool canary = true; struct slab *slab = virt_to_slab(object); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + /* Make sure canaries are not used on kfence objects.
*/ + if (is_kfence_address(object)) + canary = false; + + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false, canary))) do_slab_free(s, slab, object, object, 1, _RET_IP_); } #endif @@ -6851,7 +7046,7 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) return; /* resume freeing */ - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + if (slab_free_hook(s, object, slab_want_init_on_free(s), true, true)) do_slab_free(s, slab, object, object, 1, _THIS_IP_); } #endif /* CONFIG_SLUB_RCU_DEBUG */ @@ -6868,8 +7063,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) struct slab *slab; slab = virt_to_slab(obj); +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG_ON(!slab); +#else if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; +#endif return slab->slab_cache; } @@ -6882,10 +7081,15 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) + if (cachep && cachep != s) { +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG(); +#else + WARN(1, "%s: Wrong slab cache. 
%s but object is from %s\n", + __func__, s->name, cachep->name); print_tracking(cachep, x); +#endif + } return cachep; } @@ -6916,8 +7120,12 @@ static void free_large_kmalloc(struct page *page, void *object) return; } +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG_ON(order == 0); +#else if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); +#endif kmemleak_free(object); kasan_kfree_large(object); @@ -7574,6 +7782,23 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); slub_put_cpu_ptr(s->cpu_slab); + if (has_sanitize_verify(s)) { + int j; + + for (j = 0; j < i; j++) { + /* KASAN hasn't unpoisoned the object yet (this is done in the + * post-alloc hook), so let's do it temporarily. + */ + kasan_unpoison_new_object(s, p[j]); + BUG_ON(memchr_inv(p[j], 0, s->object_size)); + if (s->ctor) + s->ctor(p[j]); + kasan_poison_new_object(s, p[j]); + } + } + + check_set_canary_bulk(s, i, p, s->random_inactive, s->random_active); + return i; error: @@ -7589,6 +7814,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, { unsigned int i = 0; void *kfence_obj; + bool init = false; if (!size) return 0; @@ -7620,8 +7846,10 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, * the percpu sheaves, we have bigger problems. */ if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { - if (i > 0) + if (i > 0) { + check_set_canary_bulk(s, i, p, s->random_active, s->random_inactive); __kmem_cache_free_bulk(s, i, p); + } if (kfence_obj) __kfence_free(kfence_obj); return 0; @@ -7643,8 +7871,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. 
*/ + if (!has_sanitize_verify(s)) { + init = slab_want_init_on_alloc(flags, s); + } if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size))) { + init, s->object_size))) { return 0; } @@ -7671,10 +7902,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); * and increases the number of allocations possible without having to * take the list_lock. */ -static unsigned int slub_min_order; -static unsigned int slub_max_order = +static unsigned int slub_min_order __ro_after_init; +static unsigned int slub_max_order __ro_after_init = IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; -static unsigned int slub_min_objects; +static unsigned int slub_min_objects __ro_after_init; /* * Calculate the order of allocation given an slab object size. @@ -7872,6 +8103,7 @@ static void early_kmem_cache_node_alloc(int node) #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); #endif + set_canary(kmem_cache_node, n, kmem_cache_node->random_active); n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -8062,6 +8294,9 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + size += sizeof(void *); + #ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_STORE_USER) { /* @@ -8385,6 +8620,10 @@ void __check_heap_object(const void *ptr, unsigned long n, offset -= s->red_left_pad; } + if (!is_kfence) { + check_canary(s, (void *)ptr - offset, s->random_active); + } + /* Allow address range falling entirely within usercopy region. 
*/ if (offset >= s->useroffset && offset - s->useroffset <= s->usersize && @@ -8711,6 +8950,14 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->flags = kmem_cache_flags(flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); +#endif +#ifdef CONFIG_SLAB_CANARY + s->random_active = get_random_long(); + s->random_inactive = get_random_long(); + if (__slub_debug_enabled()) + s->sheaf_random_active = s->random_active; + else + s->sheaf_random_active = get_random_long(); #endif s->align = args->align; s->ctor = args->ctor; diff --git a/mm/util.c b/mm/util.c index 6aa1e89172f8..3e7b8a015f4a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -391,9 +391,9 @@ unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) - return randomize_page(mm->brk, SZ_32M); + return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - return randomize_page(mm->brk, SZ_1G); + return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; } unsigned long arch_mmap_rnd(void) diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 8134e1afca68..4e747ea55e52 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -7,6 +7,7 @@ #include "vma_internal.h" #include "vma.h" +#include /* * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between @@ -151,6 +152,8 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, mmap_write_unlock(mm); *vmap = vma; *top_mem_p = vma->vm_end - sizeof(void *); + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + *top_mem_p ^= get_random_u32() & ~PAGE_MASK; return 0; err: diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 0db06aca87ca..ec76f6508872 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -267,6 +267,7 @@ config IP_PIMSM_V2 config SYN_COOKIES bool "IP: TCP syncookie support" + default y help Normal TCP/IP networking is open to an attack known as "SYN flooding". 
This denial-of-service attack prevents legitimate remote @@ -772,3 +773,26 @@ config TCP_MD5SIG on the Internet. If unsure, say N. + +config TCP_SIMULT_CONNECT_DEFAULT_ON + bool "Enable TCP simultaneous connect" + help + Enable TCP simultaneous connect that adds a weakness in Linux's strict + implementation of TCP that allows two clients to connect to each other + without either entering a listening state. The weakness allows an + attacker to easily prevent a client from connecting to a known server + provided the source port for the connection is guessed correctly. + + As the weakness could be used to prevent an antivirus or IPS from + fetching updates, or prevent an SSL gateway from fetching a CRL, it + should be eliminated by disabling this option. Though Linux is one of + few operating systems supporting simultaneous connect, it has no + legitimate use in practice and is rarely supported by firewalls. + + Disabling this may break TCP STUNT which is used by some applications + for NAT traversal. + + This setting can be overridden at runtime via the + net.ipv4.tcp_simult_connect sysctl. + + If unsure, say N. 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a96875e32050..341582247cb8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -622,6 +622,15 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, + { + .procname = "tcp_simult_connect", + .data = &sysctl_tcp_simult_connect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static struct ctl_table ipv4_net_table[] = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7ee6e8a3331b..c2117f78d6aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,6 +85,7 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; +int sysctl_tcp_simult_connect __read_mostly = IS_ENABLED(CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON); #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -6830,7 +6831,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard_and_undo; } - if (th->syn) { + if (th->syn && sysctl_tcp_simult_connect) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. 
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index d7d45067d08b..b501130c534c 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -47,6 +47,7 @@ modpost-args = \ $(if $(CONFIG_EXTENDED_MODVERSIONS),-x) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ + $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ $(if $(KBUILD_MODPOST_WARN),-w) \ $(if $(KBUILD_NSDEPS),-d modules.nsdeps) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index 6b34ba19358d..d83e715c9d40 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -29,6 +29,11 @@ config GCC_PLUGIN_LATENT_ENTROPY is some slowdown of the boot process (about 0.5%) and fork and irq processing. + When extra_latent_entropy is passed on the kernel command line, + entropy will be extracted from up to the first 4GB of RAM while the + runtime memory allocator is being initialized. This costs even more + slowdown of the boot process. + Note that entropy extracted this way is not cryptographically secure! 
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index e30ae69cde34..9330d9adc74a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -49,6 +49,8 @@ static bool sec_mismatch_warn_only = true; /* Trim EXPORT_SYMBOLs that are unused by in-tree modules */ static bool trim_unused_exports; +static int writable_fptr_count = 0; +static int writable_fptr_verbose = false; /* ignore missing files */ static bool ignore_missing_files; /* If set to 1, only warn (instead of error) about missing ns imports */ @@ -814,6 +816,7 @@ enum mismatch { ANY_INIT_TO_ANY_EXIT, ANY_EXIT_TO_ANY_INIT, EXTABLE_TO_NON_TEXT, + DATA_TO_TEXT }; /** @@ -870,6 +873,12 @@ static const struct sectioncheck sectioncheck[] = { .bad_tosec = { ".altinstr_replacement", NULL }, .good_tosec = {ALL_TEXT_SECTIONS , NULL}, .mismatch = EXTABLE_TO_NON_TEXT, +}, +/* Do not reference code from writable data */ +{ + .fromsec = { DATA_SECTIONS, NULL }, + .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, + .mismatch = DATA_TO_TEXT } }; @@ -1035,7 +1044,13 @@ static void default_mismatch_handler(const char *modname, struct elf_info *elf, if (!secref_whitelist(fromsec, fromsym, tosec, tosym)) return; - sec_mismatch_count++; + if (mismatch->mismatch == DATA_TO_TEXT) { + writable_fptr_count++; + if (!writable_fptr_verbose) + return; + } else { + sec_mismatch_count++; + } if (!tosym[0]) snprintf(taddr_str, sizeof(taddr_str), "0x%x", (unsigned int)taddr); @@ -1069,6 +1084,11 @@ static void default_mismatch_handler(const char *modname, struct elf_info *elf, else error("%s+0x%lx references non-executable section '%s'\n", fromsec, (long)faddr, tosec); + } else if (mismatch->mismatch == DATA_TO_TEXT) { + fprintf(stderr, + "The %s:%s references\n" + "the %s:%s\n", + fromsec, fromsym, tosec, tosym); } } @@ -2286,7 +2306,7 @@ int main(int argc, char **argv) LIST_HEAD(dump_lists); struct dump_list *dl, *dl2; - while ((opt = getopt(argc, argv, "ei:MmnT:to:au:WwENd:xb")) != -1) { + while ((opt = getopt(argc, argv, 
"ei:fMmnT:to:au:WwENd:xb")) != -1) { switch (opt) { case 'e': external_module = true; @@ -2296,6 +2316,9 @@ int main(int argc, char **argv) dl->file = optarg; list_add_tail(&dl->list, &dump_lists); break; + case 'f': + writable_fptr_verbose = true; + break; case 'M': module_enabled = true; break; @@ -2396,5 +2419,11 @@ int main(int argc, char **argv) warn("suppressed %u unresolved symbol warnings because there were too many)\n", nr_unresolved - MAX_UNRESOLVED_REPORTS); + if (writable_fptr_count && !writable_fptr_verbose) + warn("modpost: Found %d writable function pointer%s.\n" + "To see full details build your kernel with:\n" + "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", + writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); + return error_occurred ? 1 : 0; } diff --git a/security/Kconfig b/security/Kconfig index 285f284dfcac..5ab4df72aedd 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -9,7 +9,7 @@ source "security/keys/Kconfig" config SECURITY_DMESG_RESTRICT bool "Restrict unprivileged access to the kernel syslog" - default n + default y help This enforces restrictions on unprivileged users reading the kernel syslog via dmesg(8). @@ -56,6 +56,7 @@ config MSEAL_SYSTEM_MAPPINGS depends on 64BIT depends on ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS depends on !CHECKPOINT_RESTORE + default y help Apply mseal on system mappings. The system mappings includes vdso, vvar, vvar_vclock, @@ -72,10 +73,34 @@ config MSEAL_SYSTEM_MAPPINGS For complete descriptions of memory sealing, please see Documentation/userspace-api/mseal.rst +config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + default y + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. 
+ +config SECURITY_TIOCSTI_RESTRICT + bool "Restrict unprivileged use of tiocsti command injection" + default y + help + This enforces restrictions on unprivileged users injecting commands + into other processes which share a tty session using the TIOCSTI + ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. + + If this option is not selected, no restrictions will be enforced + unless the tiocsti_restrict sysctl is explicitly set to (1). + + If you are unsure how to answer this question, answer N. + config SECURITY bool "Enable different security models" depends on SYSFS depends on MULTIUSER + default y help This allows you to choose different security modules to be configured into your kernel. @@ -101,6 +126,7 @@ config SECURITYFS config SECURITY_NETWORK bool "Socket and Networking Security Hooks" depends on SECURITY + default y help This enables the socket and networking security hooks. If enabled, a security module can use these hooks to diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index 86f8768c63d4..0068460db967 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -158,6 +158,7 @@ config KSTACK_ERASE_RUNTIME_DISABLE config INIT_ON_ALLOC_DEFAULT_ON bool "Enable heap memory zeroing on allocation by default" + default yes depends on !KMSAN help This has the effect of setting "init_on_alloc=1" on the kernel @@ -171,6 +172,7 @@ config INIT_ON_ALLOC_DEFAULT_ON config INIT_ON_FREE_DEFAULT_ON bool "Enable heap memory zeroing on free by default" + default yes depends on !KMSAN help This has the effect of setting "init_on_free=1" on the kernel @@ -209,6 +211,21 @@ config ZERO_CALL_USED_REGS be evaluated for suitability. For example, x86_64 grows by less than 1%, and arm64 grows by about 5%. +config PAGE_SANITIZE_VERIFY + bool "Verify sanitized pages" + default y + help + When init_on_free is enabled, verify that newly allocated pages + are zeroed to detect write-after-free bugs. 
+ +config SLAB_SANITIZE_VERIFY + bool "Verify sanitized SLAB allocations" + default y + depends on !KASAN + help + When init_on_free is enabled, verify that newly allocated slab + objects are zeroed to detect write-after-free bugs. + endmenu menu "Bounds checking" @@ -218,6 +235,7 @@ config FORTIFY_SOURCE depends on ARCH_HAS_FORTIFY_SOURCE # https://github.com/llvm/llvm-project/issues/53645 depends on !X86_32 || !CC_IS_CLANG || CLANG_VERSION >= 160000 + default y help Detect overflows of buffers in common string and memory functions where the compiler can determine and validate the buffer sizes. @@ -225,6 +243,7 @@ config FORTIFY_SOURCE config HARDENED_USERCOPY bool "Harden memory copies between kernel and userspace" imply STRICT_DEVMEM + default y help This option checks for obviously wrong memory regions when copying memory to/from the kernel (via copy_to_user() and @@ -248,6 +267,7 @@ menu "Hardening of kernel data structures" config LIST_HARDENED bool "Check integrity of linked list manipulation" + default y help Minimal integrity checking in the linked-list manipulation routines to catch memory corruptions that are not guaranteed to result in an @@ -268,6 +288,7 @@ config RUST_BITMAP_HARDENED config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select LIST_HARDENED + default y help Select this option if the kernel should BUG when it encounters data corruption in kernel memory structures when they get checked diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index 5588c4d573f6..567a33eae460 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -3,7 +3,7 @@ config SECURITY_SELINUX bool "SELinux Support" depends on SECURITY_NETWORK && AUDIT && NET && INET select NETWORK_SECMARK - default n + default y help This selects Security-Enhanced Linux (SELinux). You will also need a policy configuration and a labeled filesystem. 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d053ce562370..00b3def71f48 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -141,18 +141,6 @@ static int __init selinux_enabled_setup(char *str) __setup("selinux=", selinux_enabled_setup); #endif -static int __init checkreqprot_setup(char *str) -{ - unsigned long checkreqprot; - - if (!kstrtoul(str, 0, &checkreqprot)) { - if (checkreqprot) - pr_err("SELinux: checkreqprot set to 1 via kernel parameter. This is no longer supported.\n"); - } - return 1; -} -__setup("checkreqprot=", checkreqprot_setup); - /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 896acad1f5f7..bfda407334f0 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -709,20 +709,12 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - if (sscanf(page, "%u", &new_value) != 1) { + if (sscanf(page, "%u", &new_value) != 1 || new_value) { length = -EINVAL; goto out; } length = count; - if (new_value) { - char comm[sizeof(current->comm)]; - - strscpy(comm, current->comm); - pr_err("SELinux: %s (%d) set checkreqprot to 1. 
This is no longer supported.\n", - comm, current->pid); - } - selinux_ima_measure_state(); out: diff --git a/security/yama/Kconfig b/security/yama/Kconfig index a810304123ca..b809050b25d2 100644 --- a/security/yama/Kconfig +++ b/security/yama/Kconfig @@ -2,7 +2,7 @@ config SECURITY_YAMA bool "Yama support" depends on SECURITY - default n + default y help This selects Yama, which extends DAC support with additional system-wide security settings beyond regular Linux discretionary diff --git a/tools/perf/Documentation/security.txt b/tools/perf/Documentation/security.txt index 4fe3b8b1958f..a7d88cc23a70 100644 --- a/tools/perf/Documentation/security.txt +++ b/tools/perf/Documentation/security.txt @@ -148,6 +148,7 @@ Perf tool provides a message similar to the one below: >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling + >= 3: Disallow use of any event To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = ) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9cd706f62793..524d1dbcf009 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -3898,6 +3898,7 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, ">= 0: Disallow raw and ftrace function tracepoint access\n" ">= 1: Disallow CPU event access\n" ">= 2: Disallow kernel profiling\n" + ">= 3: Disallow use of any event\n" "To make the adjusted perf_event_paranoid setting permanent preserve it\n" "in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = )", perf_event_paranoid()); -- 2.53.0