From 42a8f333f84ee477fe547022fefd9ce6140c711a Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 14 Apr 2024 12:15:57 +0200 Subject: [PATCH] hardened Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 16 +- Documentation/admin-guide/sysctl/kernel.rst | 22 ++ Documentation/networking/ip-sysctl.rst | 18 ++ arch/arm64/Kconfig | 2 + arch/arm64/configs/defconfig | 1 - arch/arm64/include/asm/elf.h | 14 +- arch/x86/Kconfig | 8 +- arch/x86/configs/x86_64_defconfig | 1 - arch/x86/entry/vdso/vma.c | 48 +--- arch/x86/include/asm/elf.h | 15 +- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/process.c | 8 +- arch/x86/kernel/sys_x86_64.c | 7 - arch/x86/mm/init_32.c | 4 +- arch/x86/mm/init_64.c | 4 +- block/blk-mq.c | 2 +- drivers/ata/libata-core.c | 2 +- drivers/char/Kconfig | 2 - drivers/tty/Kconfig | 2 - drivers/tty/tty_io.c | 18 ++ drivers/usb/core/Makefile | 1 + drivers/usb/core/hub.c | 6 + drivers/usb/core/sysctl.c | 36 +++ drivers/usb/core/usb.c | 9 + fs/exec.c | 3 + fs/inode.c | 13 + fs/namei.c | 8 +- fs/nfs/Kconfig | 1 - fs/overlayfs/Kconfig | 16 ++ fs/overlayfs/super.c | 2 + fs/proc/Kconfig | 1 - fs/stat.c | 19 +- include/linux/cache.h | 2 + include/linux/capability.h | 5 + include/linux/fs.h | 11 + include/linux/fsnotify.h | 3 + include/linux/highmem.h | 7 + include/linux/interrupt.h | 4 +- include/linux/kobject_ns.h | 2 +- include/linux/perf_event.h | 8 + include/linux/sysctl.h | 2 + include/linux/tty.h | 2 + include/linux/usb.h | 11 + include/net/tcp.h | 1 + init/Kconfig | 16 +- io_uring/io_uring.c | 5 +- kernel/audit.c | 3 + kernel/bpf/core.c | 2 +- kernel/capability.c | 6 + kernel/events/core.c | 7 +- kernel/fork.c | 5 +- kernel/printk/sysctl.c | 9 - kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 17 +- kernel/sysctl.c | 42 +++- kernel/time/hrtimer.c | 2 +- kernel/time/timer.c | 2 +- lib/Kconfig.debug | 9 +- lib/Kconfig.kfence | 9 + lib/irq_poll.c | 2 +- lib/kobject.c | 4 +- lib/nlattr.c | 2 + lib/vsprintf.c | 2 +- mm/Kconfig | 26 +- mm/Kconfig.debug | 3 +- mm/internal.h | 3 + mm/kfence/report.c | 5 + mm/mm_init.c | 3 + mm/mmap.c | 7 + mm/page_alloc.c | 35 ++- mm/slab.h | 21 +- mm/slab_common.c | 6 +- mm/slub.c | 234 +++++++++++++++--- mm/util.c | 4 +- net/core/dev.c | 4 +- net/ipv4/Kconfig | 24 ++ net/ipv4/sysctl_net_ipv4.c | 9 + net/ipv4/tcp_input.c | 3 +- scripts/Makefile.modpost | 1 + scripts/gcc-plugins/Kconfig | 5 + scripts/mod/modpost.c | 33 ++- security/Kconfig | 29 ++- security/Kconfig.hardening | 19 ++ security/selinux/Kconfig | 2 +- security/selinux/hooks.c | 12 - security/selinux/selinuxfs.c | 10 +- security/yama/Kconfig | 2 +- tools/perf/Documentation/security.txt | 1 + tools/perf/util/evsel.c | 1 + 92 files changed, 776 insertions(+), 245 deletions(-) create mode 100644 drivers/usb/core/sysctl.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 71b26acea314..172a64b5dd7a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -606,17 +606,6 @@ nokmem -- Disable kernel memory accounting. nobpf -- Disable BPF memory accounting. - checkreqprot= [SELINUX] Set initial checkreqprot flag value. - Format: { "0" | "1" } - See security/selinux/Kconfig help text. - 0 -- check protection applied by kernel (includes - any implied execute protection). - 1 -- check protection requested by application. - Default value is set via a kernel config option. 
-			Value can be changed at runtime via
-			/sys/fs/selinux/checkreqprot.
-			Setting checkreqprot to 1 is deprecated.
-
 	cio_ignore=	[S390]
 			See Documentation/arch/s390/common_io.rst for details.
@@ -4335,6 +4324,11 @@
 			the specified number of seconds.  This is to be used if
 			your oopses keep scrolling off the screen.
 
+	extra_latent_entropy
+			Enable a very simple form of latent entropy extraction
+			from the first 4GB of memory as the bootmem allocator
+			passes the memory pages to the buddy allocator.
+
 	pcbit=		[HW,ISDN]
 
 	pci=option[,option...]	[PCI] various PCI subsystem options.
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 6584a1f9bfe3..bc1b0af96b33 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -930,6 +930,8 @@ with respect to CAP_PERFMON use cases.
 >=1  Disallow CPU event access by users without ``CAP_PERFMON``.
 
 >=2  Disallow kernel profiling by users without ``CAP_PERFMON``.
+
+>=3  Disallow use of any event by users without ``CAP_PERFMON``.
 ===  ==================================================================
@@ -1509,6 +1511,26 @@
 If a value outside of this range is written to ``threads-max`` an
 ``EINVAL`` error occurs.
 
+tiocsti_restrict
+================
+
+This toggle indicates whether unprivileged users are prevented from using the
+``TIOCSTI`` ioctl to inject commands into other processes which share a tty
+session.
+
+= ============================================================================
+0 No restriction, except the default one of only being able to inject commands
+  into one's own tty.
+1 Users must have ``CAP_SYS_ADMIN`` to use the ``TIOCSTI`` ioctl.
+= ============================================================================
+
+When user namespaces are in use, the check for ``CAP_SYS_ADMIN`` is done
+against the user namespace that originally opened the tty.
+
+The kernel config option ``CONFIG_SECURITY_TIOCSTI_RESTRICT`` sets the default
+value of ``tiocsti_restrict``.
+
+
 traceoff_on_warning
 ===================
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 7afff42612e9..af3a017de419 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -752,6 +752,24 @@ tcp_backlog_ack_defer - BOOLEAN
 
 	Default : true
 
+tcp_simult_connect - BOOLEAN
+	Enable TCP simultaneous connect that adds a weakness in Linux's strict
+	implementation of TCP that allows two clients to connect to each other
+	without either entering a listening state. The weakness allows an attacker
+	to easily prevent a client from connecting to a known server provided the
+	source port for the connection is guessed correctly.
+
+	As the weakness could be used to prevent an antivirus or IPS from fetching
+	updates, or prevent an SSL gateway from fetching a CRL, it should be
+	eliminated by disabling this option. Though Linux is one of few operating
+	systems supporting simultaneous connect, it has no legitimate use in
+	practice and is rarely supported by firewalls.
+
+	Disabling this may break TCP STUNT which is used by some applications for
+	NAT traversal.
+
+	Default: Value of CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON
+
 tcp_slow_start_after_idle - BOOLEAN
	If set, provide RFC2861 behavior and time out the congestion
	window after an idle period.
An idle period is defined at diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6b96d75a3a3d..bd58e34bd0c2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1605,6 +1605,7 @@ config RODATA_FULL_DEFAULT_ENABLED config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + default y help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved @@ -2181,6 +2182,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" select RELOCATABLE + default y help Randomizes the virtual address at which the kernel image is loaded, as a security feature that deters exploit attempts diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e6cf3e5d63c3..f0b6cc89ec47 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1,4 +1,3 @@ -CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 97932fbf973d..40c6468c093a 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -124,14 +124,10 @@ /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is above 4GB to leave the entire 32-bit address + * 64-bit, this is raised to 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ -#ifdef CONFIG_ARM64_FORCE_52BIT -#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) -#else -#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) -#endif /* CONFIG_ARM64_FORCE_52BIT */ +#define ELF_ET_DYN_BASE 0x100000000UL #ifndef __ASSEMBLY__ @@ -189,10 +185,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ - 0x7ff >> (PAGE_SHIFT - 12) : \ - 0x3ffff >> (PAGE_SHIFT - 12)) + ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ + ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) #else -#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) +#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) #endif #ifdef __AARCH64EB__ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f526ddc8dc..d80f4e159ce6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1224,8 +1224,7 @@ config VM86 default X86_LEGACY_VM86 config X86_16BIT - bool "Enable support for 16-bit segments" if EXPERT - default y + bool "Enable support for 16-bit segments" depends on MODIFY_LDT_SYSCALL help This option is required by programs like Wine to run 16-bit @@ -2303,7 +2302,7 @@ config COMPAT_VDSO choice prompt "vsyscall table for legacy applications" depends on X86_64 - default LEGACY_VSYSCALL_XONLY + default LEGACY_VSYSCALL_NONE help Legacy user code that does not know how to find the vDSO expects to be able to issue three syscalls by calling fixed addresses in @@ -2387,8 +2386,7 @@ config CMDLINE_OVERRIDE be set to 'N' under normal conditions. 
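The STACK_RND_MASK changes above derive stack randomization from mmap_rnd_bits
(and mmap_rnd_compat_bits) instead of the old hard-coded masks. A userspace
sketch of the resulting entropy window, assuming PAGE_SHIFT == 12 and a kernel
that exposes vm.mmap_rnd_bits through procfs (illustrative only, not part of
this patch)::

    /* stack_rnd.c - derive the stack randomization window from
     * vm.mmap_rnd_bits, mirroring the patched STACK_RND_MASK formula.
     * Assumes PAGE_SHIFT == 12, so the >> (PAGE_SHIFT - 12) term drops out.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/mmap_rnd_bits", "r");
            unsigned long bits, mask;

            if (!f || fscanf(f, "%lu", &bits) != 1)
                    return 1;
            fclose(f);

            /* STACK_RND_MASK = ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12) */
            mask = (1UL << bits) - 1;
            printf("stack top randomized over %lu MiB (%lu bits)\n",
                   (mask << 12) >> 20, bits);
            return 0;
    }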
config MODIFY_LDT_SYSCALL - bool "Enable the LDT (local descriptor table)" if EXPERT - default y + bool "Enable the LDT (local descriptor table)" help Linux can allow user programs to install a per-process x86 Local Descriptor Table (LDT) using the modify_ldt(2) system diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 61e25f6209ed..c95144785660 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,5 +1,4 @@ CONFIG_WERROR=y -CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7645730dc228..96c818d3749d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -275,55 +275,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } #ifdef CONFIG_X86_64 -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= DEFAULT_MAP_WINDOW) - end = DEFAULT_MAP_WINDOW; - end -= len; - - if (end > start) { - offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -} - static int map_vdso_randomized(const struct vdso_image *image) { - unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); - - return map_vdso(image, addr); + return map_vdso(image, 0); } #endif diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1e16bd5ac781..74f31be6f4c4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -232,11 +232,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is above 4GB to leave the entire 32-bit address + * 64-bit, this is raised to 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - (DEFAULT_MAP_WINDOW / 3 * 2)) + 0x100000000UL) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. 
This could be done in user space, @@ -318,8 +318,8 @@ extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 -#define __STACK_RND_MASK(is32bit) (0x7ff) -#define STACK_RND_MASK (0x7ff) +#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) +#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) #define ARCH_DLINFO ARCH_DLINFO_IA32 @@ -328,7 +328,11 @@ extern unsigned long get_sigframe_size(void); #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ -#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) +#ifdef CONFIG_COMPAT +#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) +#else +#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) +#endif #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ @@ -392,5 +396,4 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 25726893c6f4..cd4b5781260d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -421,6 +421,7 @@ static inline void set_tlbstate_lam_mode(struct mm_struct *mm) static inline void __native_tlb_flush_global(unsigned long cr4) { + BUG_ON(cr4 != __read_cr4()); native_write_cr4(cr4 ^ X86_CR4_PGE); native_write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 785fedddb5f0..233c72d96697 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -432,6 +432,7 @@ EXPORT_SYMBOL_GPL(native_write_cr4); void cr4_update_irqsoff(unsigned long set, unsigned long clear) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + BUG_ON(cr4 != __read_cr4()); lockdep_assert_irqs_disabled(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ab49ade31b0d..38f180550089 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include #include "process.h" @@ -681,6 +683,7 @@ void speculation_ctrl_update_current(void) static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + BUG_ON(cr4 != __read_cr4()); newval = cr4 ^ mask; if (newval != cr4) { @@ -1030,7 +1033,10 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { - return randomize_page(mm->brk, 0x02000000); + if (mmap_is_ia32()) + return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + else + return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; } /* diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index c783aeb37dce..cb9fa1d5c66f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -52,13 +52,6 @@ static unsigned long get_align_bits(void) return va_align.bits & get_align_mask(); } -unsigned long align_vdso_addr(unsigned long addr) -{ - unsigned long align_mask = get_align_mask(); - addr = (addr + align_mask) & ~align_mask; - return addr | get_align_bits(); -} - static int __init control_va_addr_alignment(char *str) { /* guard against enabling this on other CPU families */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b63403d7179d..a40bcad396d9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -516,9 +516,9 @@ static void __init pagetable_init(void) #define 
DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) /* Bits supported by the hardware: */ -pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; +pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; /* Bits allowed in normal kernel mappings: */ -pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; +pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; EXPORT_SYMBOL_GPL(__supported_pte_mask); /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ EXPORT_SYMBOL(__default_kernel_pte_mask); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 534436c9d399..aef8e70a2fc8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -104,9 +104,9 @@ static inline pgprot_t prot_sethuge(pgprot_t prot) */ /* Bits supported by the hardware: */ -pteval_t __supported_pte_mask __read_mostly = ~0; +pteval_t __supported_pte_mask __ro_after_init = ~0; /* Bits allowed in normal kernel mappings: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; +pteval_t __default_kernel_pte_mask __ro_after_init = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ EXPORT_SYMBOL(__default_kernel_pte_mask); diff --git a/block/blk-mq.c b/block/blk-mq.c index 25d2f3239eb6..ac3db3f08146 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1130,7 +1130,7 @@ static void blk_complete_reqs(struct llist_head *list) rq->q->mq_ops->complete(rq); } -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index be3412cdb22e..23ab27b03a20 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4800,7 +4800,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) struct ata_port *ap; struct ata_link *link; - WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); ap = qc->ap; link = qc->dev->link; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 7c8dd0abcfdf..db74705bb013 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -309,7 +309,6 @@ config NSC_GPIO config DEVMEM bool "/dev/mem virtual device support" - default y help Say Y here if you want to support the /dev/mem device. The /dev/mem device is used to access areas of physical @@ -342,7 +341,6 @@ config NVRAM config DEVPORT bool "/dev/port character device" depends on HAS_IOPORT - default y help Say Y here if you want to support the /dev/port device. The /dev/port device is similar to /dev/mem, but for I/O ports. diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 5646dc6242cd..513412df9691 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -121,7 +121,6 @@ config UNIX98_PTYS config LEGACY_PTYS bool "Legacy (BSD) PTY support" - default y help A pseudo terminal (PTY) is a software device consisting of two halves: a master and a slave. The slave device behaves identical to @@ -151,7 +150,6 @@ config LEGACY_PTY_COUNT config LEGACY_TIOCSTI bool "Allow legacy TIOCSTI usage" - default y help Historically the kernel has allowed TIOCSTI, which will push characters into a controlling TTY. 
This continues to be used diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c1..ad8a4a532e51 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty) tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); + put_user_ns(tty->owner_user_ns); kfree(tty); } @@ -2265,6 +2266,7 @@ static int tty_fasync(int fd, struct file *filp, int on) } static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI); +static int tty_tiocsti_restrict __read_mostly = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); /** * tiocsti - fake input character * @tty: tty to fake input into @@ -2286,6 +2288,12 @@ static int tiocsti(struct tty_struct *tty, u8 __user *p) if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; + if (tty_tiocsti_restrict && + !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { + dev_warn_ratelimited(tty->dev, + "Denied TIOCSTI ioctl for non-privileged process\n"); + return -EPERM; + } if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) @@ -3144,6 +3152,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); + tty->owner_user_ns = get_user_ns(current_user_ns()); return tty; } @@ -3635,6 +3644,15 @@ static struct ctl_table tty_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "tiocsti_restrict", + .data = &tty_tiocsti_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; /* diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index 7d338e9c0657..64ac3c36fa1a 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -11,6 +11,7 @@ usbcore-y += phy.o port.o usbcore-$(CONFIG_OF) += of.o usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o +usbcore-$(CONFIG_SYSCTL) += sysctl.o ifdef CONFIG_USB_ONBOARD_HUB usbcore-y += ../misc/onboard_usb_hub_pdevs.o diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 64e54163f05e..e64646413a6e 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5373,6 +5373,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto done; return; } + + if (deny_new_usb) { + dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); + goto done; + } + if (hub_is_superspeed(hub->hdev)) unit_load = 150; else diff --git a/drivers/usb/core/sysctl.c b/drivers/usb/core/sysctl.c new file mode 100644 index 000000000000..60ba11557131 --- /dev/null +++ b/drivers/usb/core/sysctl.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include + +static struct ctl_table usb_sysctls[] = { + { + .procname = "deny_new_usb", + .data = &deny_new_usb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static struct ctl_table_header *usb_sysctl_table; + +int usb_register_sysctl(void) +{ + usb_sysctl_table = register_sysctl("kernel", usb_sysctls); + if (!usb_sysctl_table) { + pr_warn("usb: sysctl registration failed\n"); + return -ENOMEM; + } + return 0; +} + +void usb_unregister_sysctl(void) +{ + unregister_sysctl_table(usb_sysctl_table); + usb_sysctl_table = NULL; +} diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 
dc8d9228a5e7..ef69409afee8 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -72,6 +72,9 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #define usb_autosuspend_delay 0 #endif +int deny_new_usb __read_mostly = 0; +EXPORT_SYMBOL(deny_new_usb); + static bool match_endpoint(struct usb_endpoint_descriptor *epd, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, @@ -1080,6 +1083,9 @@ static int __init usb_init(void) usb_debugfs_init(); usb_acpi_register(); + retval = usb_register_sysctl(); + if (retval) + goto sysctl_init_failed; retval = bus_register(&usb_bus_type); if (retval) goto bus_register_failed; @@ -1119,6 +1125,8 @@ static int __init usb_init(void) bus_notifier_failed: bus_unregister(&usb_bus_type); bus_register_failed: + usb_unregister_sysctl(); +sysctl_init_failed: usb_acpi_unregister(); usb_debugfs_cleanup(); out: @@ -1143,6 +1151,7 @@ static void __exit usb_exit(void) class_unregister(&usbmisc_class); bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); + usb_unregister_sysctl(); usb_acpi_unregister(); usb_debugfs_cleanup(); idr_destroy(&usb_bus_idr); diff --git a/fs/exec.c b/fs/exec.c index 5ee2545c3e18..0173819d90b1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -286,6 +287,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + bprm->p ^= get_random_u32() & ~PAGE_MASK; return 0; err: mmap_write_unlock(mm); diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e..2c1bf35318e6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -97,6 +97,10 @@ long get_nr_dirty_inodes(void) return nr_dirty > 0 ? 
nr_dirty : 0; } +/* sysctl */ +int device_sidechannel_restrict __read_mostly = 1; +EXPORT_SYMBOL(device_sidechannel_restrict); + /* * Handle nr_inode sysctl */ @@ -129,6 +133,15 @@ static struct ctl_table inodes_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_inodes, }, + { + .procname = "device_sidechannel_restrict", + .data = &device_sidechannel_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_inode_sysctls(void) diff --git a/fs/namei.c b/fs/namei.c index 9342fa6a38c2..46d2be75dafb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1028,10 +1028,10 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -static int sysctl_protected_symlinks __read_mostly; -static int sysctl_protected_hardlinks __read_mostly; -static int sysctl_protected_fifos __read_mostly; -static int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly = 1; +static int sysctl_protected_hardlinks __read_mostly = 1; +static int sysctl_protected_fifos __read_mostly = 2; +static int sysctl_protected_regular __read_mostly = 2; #ifdef CONFIG_SYSCTL static struct ctl_table namei_sysctls[] = { diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f7e32d76e34d..1b8188a34dcd 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -195,7 +195,6 @@ config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG select CRC32 - default y config NFS_DISABLE_UDP_SUPPORT bool "NFS: Disable NFS UDP protocol support" diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 2ac67e04a6fb..3340e13c959c 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -134,3 +134,19 @@ config OVERLAY_FS_DEBUG Say Y here to enable extra debugging checks in overlayfs. If unsure, say N. + +config OVERLAY_FS_UNPRIVILEGED + bool "Overlayfs: turn on unprivileged user namespace mounts" + default n + depends on OVERLAY_FS + help + When disabled, unprivileged users will not be able to create + new overlayfs mounts. This cuts the attack surface if no + unprivileged user namespace mounts are required like for + running rootless containers. + + Overlayfs has been part of several recent local privilege + escalation exploits, so if you are security-conscious + you want to disable this. + + If unsure, say N. diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2eef6c70b2ae..0e7bd050a576 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1492,7 +1492,9 @@ struct file_system_type ovl_fs_type = { .name = "overlay", .init_fs_context = ovl_init_fs_context, .parameters = ovl_parameter_spec, +#ifdef CONFIG_OVERLAY_FS_UNPRIVILEGED .fs_flags = FS_USERNS_MOUNT, +#endif .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("overlay"); diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137..bb80ff84280d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -41,7 +41,6 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support" depends on PROC_FS && CRASH_DUMP - default y help Exports the dump image of crashed kernel in ELF format. 
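The device_sidechannel_restrict sysctl added above gates the timestamp fuzzing
implemented in the fs/stat.c hunk that follows: with the knob enabled, tasks
without CAP_MKNOD see ctime in place of atime/mtime on world-accessible device
nodes. A minimal userspace probe, using /dev/tty purely as a convenient example
of a world-writable character device::

    /* sidechannel_stat.c - with device_sidechannel_restrict=1 and no
     * CAP_MKNOD, st_atime and st_mtime should both read back as st_ctime. */
    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct stat st;

            if (stat("/dev/tty", &st))
                    return 1;
            printf("atime=%ld mtime=%ld ctime=%ld\n",
                   (long)st.st_atime, (long)st.st_mtime, (long)st.st_ctime);
            return 0;
    }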
diff --git a/fs/stat.c b/fs/stat.c index 77cdc69eb422..fcfed49753cd 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -57,8 +57,13 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); - stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); + if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { + stat->atime = inode_get_ctime(inode); + stat->mtime = inode_get_ctime(inode); + } else { + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); + } stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; @@ -131,10 +136,16 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); - if (inode->i_op->getattr) - return inode->i_op->getattr(idmap, path, stat, + if (inode->i_op->getattr) { + int retval = inode->i_op->getattr(idmap, path, stat, request_mask, query_flags | AT_GETATTR_NOSEC); + if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { + stat->atime = stat->ctime; + stat->mtime = stat->ctime; + } + return retval; + } generic_fillattr(idmap, request_mask, inode, stat); return 0; diff --git a/include/linux/cache.h b/include/linux/cache.h index 0ecb17bb6883..fd3d17033967 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -37,6 +37,8 @@ #define __ro_after_init __section(".data..ro_after_init") #endif +#define __read_only __ro_after_init + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif diff --git a/include/linux/capability.h b/include/linux/capability.h index 0c356a517991..1f7176393417 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -146,6 +146,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); +extern bool capable_noaudit(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); @@ -172,6 +173,10 @@ static inline bool capable(int cap) { return true; } +static inline bool capable_noaudit(int cap) +{ + return true; +} static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; diff --git a/include/linux/fs.h b/include/linux/fs.h index 08ecac9d7b8b..4fac292f0700 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3534,4 +3534,15 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +extern int device_sidechannel_restrict; + +static inline bool is_sidechannel_device(const struct inode *inode) +{ + umode_t mode; + if (!device_sidechannel_restrict) + return false; + mode = inode->i_mode; + return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8300a5286988..6c751e0585fa 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -97,6 +97,9 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return 0; path = &file->f_path; + if (mask & (FS_ACCESS | FS_MODIFY) && is_sidechannel_device(file_inode(file))) + return 0; + return 
fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e87..ce882b45248f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -257,6 +257,13 @@ static inline void tag_clear_highpage(struct page *page) #endif +static inline void verify_zero_highpage(struct page *page) +{ + void *kaddr = kmap_atomic(page); + BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); + kunmap_atomic(kaddr); +} + /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5c9bdd3ffccc..1b537a779f63 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -590,7 +590,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; struct softirq_action { - void (*action)(struct softirq_action *); + void (*action)(void); }; asmlinkage void do_softirq(void); @@ -605,7 +605,7 @@ static inline void do_softirq_post_smp_call_flush(unsigned int unused) } #endif -extern void open_softirq(int nr, void (*action)(struct softirq_action *)); +extern void __init open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index be707748e7ce..40b0d6710f10 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -45,7 +45,7 @@ struct kobj_ns_type_operations { void (*drop_ns)(void *); }; -int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); +int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); int kobj_ns_type_registered(enum kobj_ns_type type); const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent); const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d2a15c0c6f8a..0330a6b4a7da 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1618,6 +1618,14 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } +static inline int perf_allow_open(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 2 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_OPEN); +} + static inline int perf_allow_kernel(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index ee7d33b89e9e..f2bb8cf40642 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -74,6 +74,8 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/tty.h b/include/linux/tty.h index 8c76fd97d4ad..09887bf51b82 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -15,6 +15,7 @@ #include #include #include +#include /* @@ -248,6 +249,7 @@ struct tty_struct { #define N_TTY_BUF_SIZE 4096 struct 
work_struct SAK_work; + struct user_namespace *owner_user_ns; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 9e52179872a5..27a7948ba62d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2053,6 +2053,17 @@ extern void usb_led_activity(enum usb_led_event ev); static inline void usb_led_activity(enum usb_led_event ev) {} #endif +/* sysctl.c */ +extern int deny_new_usb; +#ifdef CONFIG_SYSCTL +extern int usb_register_sysctl(void); +extern void usb_unregister_sysctl(void); +#else +static inline int usb_register_sysctl(void) { return 0; } +static inline void usb_unregister_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + + #endif /* __KERNEL__ */ #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 3998a5f145ad..c33a939193ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -257,6 +257,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; +extern int sysctl_tcp_simult_connect; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ diff --git a/init/Kconfig b/init/Kconfig index 9ea39297f149..b0323e793162 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -462,6 +462,7 @@ config USELIB config AUDIT bool "Auditing support" depends on NET + default y help Enable auditing infrastructure that can be used with another kernel subsystem, such as SELinux (which requires this for @@ -1253,8 +1254,8 @@ config USER_NS config USER_NS_UNPRIVILEGED bool "Allow unprivileged users to create namespaces" - default y depends on USER_NS + default n help When disabled, unprivileged users will not be able to create new namespaces. Allowing users to create their own namespaces @@ -1265,7 +1266,7 @@ config USER_NS_UNPRIVILEGED This setting can be overridden at runtime via the kernel.unprivileged_userns_clone sysctl. - If unsure, say Y. + If unsure, say N. config PID_NS bool "PID Namespaces" @@ -1508,9 +1509,8 @@ menuconfig EXPERT Only use this if you really know what you are doing. config UID16 - bool "Enable 16-bit UID system calls" if EXPERT + bool "Enable 16-bit UID system calls" depends on HAVE_UID16 && MULTIUSER - default y help This enables the legacy 16-bit UID syscall wrappers. @@ -1539,14 +1539,13 @@ config SGETMASK_SYSCALL If unsure, leave the default option here. config SYSFS_SYSCALL - bool "Sysfs syscall support" if EXPERT - default y + bool "Sysfs syscall support" help sys_sysfs is an obsolete system call no longer supported in libc. Note that disabling this option is more secure but might break compatibility with some systems. - If unsure say Y here. + If unsure say N here. config FHANDLE bool "open by fhandle syscalls" if EXPERT @@ -1685,8 +1684,7 @@ config SHMEM which may be appropriate on small systems without swap. config AIO - bool "Enable AIO support" if EXPERT - default y + bool "Enable AIO support" help This option enables POSIX asynchronous I/O which may by used by some high performance threaded applications. 
Disabling diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0f52ea80103e..5b2399d0126f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -154,7 +154,7 @@ static void io_queue_sqe(struct io_kiocb *req); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; -static int __read_mostly sysctl_io_uring_disabled; +static int __read_mostly sysctl_io_uring_disabled = 1; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL @@ -164,8 +164,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .data = &sysctl_io_uring_disabled, .maxlen = sizeof(sysctl_io_uring_disabled), .mode = 0644, + /* only handle a transition from default "1" to "2" */ .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_TWO, .extra2 = SYSCTL_TWO, }, { diff --git a/kernel/audit.c b/kernel/audit.c index 9c8e5f732c4c..72c7a561599c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1744,6 +1744,9 @@ static int __init audit_enable(char *str) if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; + else if (!audit_ever_enabled) + audit_initialized = AUDIT_UNINITIALIZED; + if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 026627226ec4..4693ef1bbf83 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -560,7 +560,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); -int bpf_jit_harden __read_mostly; +int bpf_jit_harden __read_mostly = 2; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; diff --git a/kernel/capability.c b/kernel/capability.c index dac4df77e376..68bb3279b14f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -436,6 +436,12 @@ bool capable(int cap) return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); + +bool capable_noaudit(int cap) +{ + return ns_capable_noaudit(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable_noaudit); #endif /* CONFIG_MULTIUSER */ /** diff --git a/kernel/events/core.c b/kernel/events/core.c index f0f0f71213a1..f2a813fcea86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -415,8 +415,13 @@ static struct kmem_cache *perf_event_cache; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 2; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -12445,7 +12450,7 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? 
*/ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = perf_allow_open(&attr); if (err) return err; diff --git a/kernel/fork.c b/kernel/fork.c index 5449c990a91a..bc4eced5e8e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include #include @@ -102,10 +103,6 @@ #include #include -#ifdef CONFIG_USER_NS -#include -#endif - #include #include #include diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index c228343eeb97..c7129428ee9b 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -11,15 +11,6 @@ static const int ten_thousand = 10000; -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - static struct ctl_table printk_sysctls[] = { { .procname = "printk", diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index fec804b79080..106125f3b34d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -105,7 +105,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) } /* Invoke the RCU callbacks whose grace period has elapsed. */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f2c10d351b59..d264618f41ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2477,7 +2477,7 @@ static __latent_entropy void rcu_core(void) queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } -static void rcu_core_si(struct softirq_action *h) +static void rcu_core_si(void) { rcu_core(); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97983b041e9d..e37db29fe90c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12437,7 +12437,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(void) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? 
diff --git a/kernel/softirq.c b/kernel/softirq.c index 210cf5f8d92c..ebb995221f91 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -56,7 +56,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -550,7 +550,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) kstat_incr_softirqs_this_cpu(vec_nr); trace_softirq_entry(vec_nr); - h->action(h); + h->action(); trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", @@ -695,7 +695,7 @@ void __raise_softirq_irqoff(unsigned int nr) or_softirq_pending(1UL << nr); } -void open_softirq(int nr, void (*action)(struct softirq_action *)) +void __init open_softirq(int nr, void (*action)(void)) { softirq_vec[nr].action = action; } @@ -755,8 +755,7 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } -static void tasklet_action_common(struct softirq_action *a, - struct tasklet_head *tl_head, +static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { struct tasklet_struct *list; @@ -800,14 +799,14 @@ static void tasklet_action_common(struct softirq_action *a, } } -static __latent_entropy void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(void) { - tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(void) { - tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_setup(struct tasklet_struct *t, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c92d8a4b23fb..2411555d9cdc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -80,6 +80,9 @@ #ifdef CONFIG_RT_MUTEXES #include #endif +#ifdef CONFIG_USER_NS +#include +#endif /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; @@ -95,9 +98,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif -#ifdef CONFIG_USER_NS -#include -#endif static const int ngroups_max = NGROUPS_MAX; @@ -811,6 +811,35 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +/** + * proc_dointvec_minmax_sysadmin - read a vector of integers with min/max values + * checking CAP_SYS_ADMIN on write + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Writing is only allowed when the current task has CAP_SYS_ADMIN. + * + * Returns 0 on success, -EPERM on permission failure or -EINVAL on write + * when the range check fails. 
+ */ +int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -1543,6 +1572,12 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2273,6 +2308,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL(proc_dointvec_minmax_sysadmin); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index edb0f821dcea..494516a675ba 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1757,7 +1757,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 352b161113cd..dad814b951e2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2044,7 +2044,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ef36b829ae1f..540710015eed 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -509,6 +509,9 @@ config SECTION_MISMATCH_WARN_ONLY If unsure, say Y. +config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE + bool "Enable verbose reporting of writable function pointers" + config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC || RISCV || S390) @@ -647,7 +650,7 @@ config DEBUG_FS choice prompt "Debugfs default access" depends on DEBUG_FS - default DEBUG_FS_ALLOW_ALL + default DEBUG_FS_ALLOW_NONE help This selects the default access restrictions for debugfs. It can be overridden with kernel command line option @@ -987,6 +990,7 @@ menu "Debug Oops, Lockups and Hangs" config PANIC_ON_OOPS bool "Panic on Oops" + default y help Say Y here to enable the kernel to panic when it oopses. This has the same effect as setting oops=panic on the kernel command @@ -996,7 +1000,7 @@ config PANIC_ON_OOPS anything erroneous after an oops which could result in data corruption or other issues. - Say N if unsure. + Say Y if unsure. 
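Since proc_dointvec_minmax_sysadmin is promoted above to a shared, exported
helper, new knobs can reuse it exactly the way the tty, USB and inode hunks
earlier in this patch do. A hypothetical in-kernel sketch (the example_restrict
knob and table names are illustrative, not part of this patch)::

    #include <linux/sysctl.h>

    static int example_restrict __read_mostly = 1;

    static struct ctl_table example_sysctls[] = {
            {
                    .procname     = "example_restrict",
                    .data         = &example_restrict,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    /* writes are clamped to 0..1 and require CAP_SYS_ADMIN */
                    .proc_handler = proc_dointvec_minmax_sysadmin,
                    .extra1       = SYSCTL_ZERO,
                    .extra2       = SYSCTL_ONE,
            },
            { }
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
            /* exposes /proc/sys/kernel/example_restrict */
            example_header = register_sysctl("kernel", example_sysctls);
            return example_header ? 0 : -ENOMEM;
    }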
config PANIC_ON_OOPS_VALUE int @@ -1858,6 +1862,7 @@ config STRICT_DEVMEM config IO_STRICT_DEVMEM bool "Filter I/O access to /dev/mem" depends on STRICT_DEVMEM + default y help If this option is disabled, you allow userspace (root) access to all io-memory regardless of whether a driver is actively using that diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 6fbbebec683a..e494618f7193 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -96,4 +96,13 @@ config KFENCE_KUNIT_TEST during boot; say M if you want the test to build as a module; say N if you are unsure. +config KFENCE_BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + default y + help + Select this option if the kernel should BUG when kfence encounters + data corruption of kfence managed objects after error report. + + If unsure, say Y. + endif # KFENCE diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2d5329a42105..08b242bbdbdf 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void __latent_entropy irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(void) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/lib/kobject.c b/lib/kobject.c index 72fa20f405f1..c5e9051f0925 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -1019,9 +1019,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); static DEFINE_SPINLOCK(kobj_ns_type_lock); -static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; +static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; -int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) +int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) { enum kobj_ns_type type = ops->type; int error; diff --git a/lib/nlattr.c b/lib/nlattr.c index be9c576b6e2d..484d839bcf5e 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -837,6 +837,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); + BUG_ON(minlen < 0); + memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 552738f14275..50a8a22fa3e7 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -847,7 +847,7 @@ static char *default_pointer(char *buf, char *end, const void *ptr, return ptr_to_id(buf, end, ptr, spec); } -int kptr_restrict __read_mostly; +int kptr_restrict __read_mostly = 2; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, diff --git a/mm/Kconfig b/mm/Kconfig index 0e440573033c..a26e4605a831 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -260,7 +260,6 @@ config SLUB_TINY config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" - default y help For reduced kernel memory fragmentation, slab caches can be merged when they share the same size and other characteristics. @@ -275,6 +274,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" depends on !SLUB_TINY + default y help Randomizes the freelist order used on creating new pages. 
This security feature reduces the predictability of the kernel slab @@ -283,12 +283,30 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" depends on !SLUB_TINY + default y help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance sacrifices to harden the kernel slab allocator against common freelist exploit methods. +config SLAB_CANARY + depends on SLUB + depends on !SLAB_MERGE_DEFAULT + bool "SLAB canaries" + default y + help + Place canaries at the end of kernel slab allocations, sacrificing + some performance and memory usage for security. + + Canaries can detect some forms of heap corruption when allocations + are freed and as part of the HARDENED_USERCOPY feature. It provides + basic use-after-free detection for HARDENED_USERCOPY. + + Canaries absorb small overflows (rendering them harmless), mitigate + non-NUL terminated C string overflows on 64-bit via a guaranteed zero + byte and provide basic double-free detection. + config SLUB_STATS default n bool "Enable performance statistics" @@ -314,7 +332,7 @@ config SLUB_CPU_PARTIAL Typically one would choose no for a realtime system. config RANDOM_KMALLOC_CACHES - default n + default y depends on !SLUB_TINY bool "Randomize slab caches for normal kmalloc" help @@ -358,7 +376,6 @@ config SHUFFLE_PAGE_ALLOCATOR config COMPAT_BRK bool "Disable heap randomization" - default y help Randomizing heap placement makes heap exploits harder, but it also breaks ancient binaries (including anything libc5 based). @@ -726,7 +743,8 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" depends on MMU - default 4096 + default 32768 if ARM || (ARM64 && COMPAT) + default 65536 help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 321ab379994f..e88003133a9c 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -47,7 +47,7 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config SLUB_DEBUG default y - bool "Enable SLUB debugging support" if EXPERT + bool "Enable SLUB debugging support" depends on SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help @@ -156,6 +156,7 @@ config DEBUG_WX depends on ARCH_HAS_DEBUG_WX depends on MMU select PTDUMP_CORE + default y help Generate a warning if any W+X mappings are found at boot. diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..fb8723bcce0c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -424,6 +424,9 @@ static inline struct folio *page_rmappable_folio(struct page *page) return folio; } +extern void __init __gather_extra_latent_entropy(struct page *page, + unsigned int nr_pages); + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index c509aed326ce..096191be8706 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -272,6 +273,10 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r lockdep_on(); +#ifdef CONFIG_KFENCE_BUG_ON_DATA_CORRUPTION + BUG(); +#endif + check_panic_on_warn("KFENCE"); /* We encountered a memory safety error, taint the kernel! 
*/ diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36..1bd3aa2a6633 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1970,6 +1970,7 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); + __gather_extra_latent_entropy(page, 1 << MAX_PAGE_ORDER); __free_pages_core(page, MAX_PAGE_ORDER); return; } @@ -1980,6 +1981,7 @@ static void __init deferred_free_range(unsigned long pfn, for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __gather_extra_latent_entropy(page, 1); __free_pages_core(page, 0); } } @@ -2577,6 +2579,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. */ return; } + __gather_extra_latent_entropy(page, 1 << order); __free_pages_core(page, order); } diff --git a/mm/mmap.c b/mm/mmap.c index 3281287771c9..6c30817190b7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -218,6 +218,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); + /* properly handle unaligned min_brk as an empty heap */ + if (min_brk & ~PAGE_MASK) { + if (brk == min_brk) + newbrk -= PAGE_SIZE; + if (mm->brk == min_brk) + oldbrk -= PAGE_SIZE; + } if (oldbrk == newbrk) { mm->brk = brk; goto success; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7c48b114331b..2c70127529fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -181,6 +181,15 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); static DEFINE_MUTEX(pcpu_drain_mutex); +bool __meminitdata extra_latent_entropy; + +static int __init setup_extra_latent_entropy(char *str) +{ + extra_latent_entropy = true; + return 0; +} +early_param("extra_latent_entropy", setup_extra_latent_entropy); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -1285,6 +1294,25 @@ static void __free_pages_ok(struct page *page, unsigned int order, __count_vm_events(PGFREE, 1 << order); } +void __init __gather_extra_latent_entropy(struct page *page, + unsigned int nr_pages) +{ + if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { + unsigned long hash = 0; + size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; + const unsigned long *data = lowmem_page_address(page); + + for (index = 0; index < end; index++) + hash ^= hash + data[index]; +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + latent_entropy ^= hash; + add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); +#else + add_device_randomness((const void *)&hash, sizeof(hash)); +#endif + } +} + void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; @@ -1304,7 +1332,6 @@ void __free_pages_core(struct page *page, unsigned int order) } __ClearPageReserved(p); set_page_count(p, 0); - atomic_long_add(nr_pages, &page_zone(page)->managed_pages); if (page_contains_unaccepted(page, order)) { @@ -1499,6 +1526,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ kernel_unpoison_pages(page, 1 << order); + if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { + int i; + for (i = 0; i < (1 << order); i++) + verify_zero_highpage(page + i); + } + /* * As memory initialization might be integrated into KASAN, * KASAN unpoisoning and memory initialization code must be diff --git a/mm/slab.h
b/mm/slab.h index 54deeb0428c6..a6d9a00e1ed7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -284,6 +284,11 @@ struct kmem_cache { unsigned long random; #endif +#ifdef CONFIG_SLAB_CANARY + unsigned long random_active; + unsigned long random_inactive; +#endif + #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. @@ -597,7 +602,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) return s->inuse; /* * Else we can use all the padding etc for the allocation @@ -632,8 +637,10 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) { +#ifndef CONFIG_SLUB if (c->ctor) return false; +#endif if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) return flags & __GFP_ZERO; return true; @@ -644,9 +651,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) static inline bool slab_want_init_on_free(struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free)) - return !(c->ctor || - (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + &init_on_free)) { +#ifndef CONFIG_SLUB + if (c->ctor) + return false; +#endif + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return false; + return true; + } return false; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 238293b1dbe1..5026d9791e49 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,10 +35,10 @@ #define CREATE_TRACE_POINTS #include -enum slab_state slab_state; +enum slab_state slab_state __ro_after_init; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); -struct kmem_cache *kmem_cache; +struct kmem_cache *kmem_cache __ro_after_init; static LIST_HEAD(slab_caches_to_rcu_destroy); static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); @@ -58,7 +58,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, /* * Merge control. If this is set then no merging of slab caches will occur. */ -static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); +static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { diff --git a/mm/slub.c b/mm/slub.c index 2ef88bbf56a3..e7e8e7b3176e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,12 @@ static inline bool slub_debug_orig_size(struct kmem_cache *s) (s->flags & SLAB_KMALLOC)); } +static inline bool has_sanitize_verify(struct kmem_cache *s) +{ + return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && + slab_want_init_on_free(s); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -732,6 +739,55 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, return false; } +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLAB_CANARY) +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. 
+ */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} +#endif + +#ifdef CONFIG_SLAB_CANARY +static inline unsigned long *get_canary(struct kmem_cache *s, void *object) +{ + return object + get_info_end(s); +} + +static inline unsigned long get_canary_value(const void *canary, unsigned long value) +{ + return (value ^ (unsigned long)canary) & CANARY_MASK; +} + +static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) +{ + unsigned long *canary = get_canary(s, object); + *canary = get_canary_value(canary, value); +} + +static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) +{ + unsigned long *canary = get_canary(s, object); + BUG_ON(*canary != get_canary_value(canary, value)); +} +#else +#define set_canary(s, object, value) +#define check_canary(s, object, value) +#endif + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -788,13 +844,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; #else -static slab_flags_t slub_debug; +static slab_flags_t slub_debug __ro_after_init; #endif -static char *slub_debug_string; -static int disable_higher_order_debug; +static char *slub_debug_string __ro_after_init; +static int disable_higher_order_debug __ro_after_init; /* * slub is about to manipulate internal object metadata. This memory lies @@ -845,26 +901,6 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } -/* - * See comment in calculate_sizes(). - */ -static inline bool freeptr_outside_object(struct kmem_cache *s) -{ - return s->offset >= s->inuse; -} - -/* - * Return offset of the end of info block which is inuse + free pointer if - * not overlapping with object. - */ -static inline unsigned int get_info_end(struct kmem_cache *s) -{ - if (freeptr_outside_object(s)) - return s->inuse + sizeof(void *); - else - return s->inuse; -} - static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { @@ -872,6 +908,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, p = object + get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + p = (void *)p + sizeof(void *); + return kasan_reset_tag(p + alloc); } @@ -1070,6 +1109,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) off = get_info_end(s); + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + off += sizeof(void *); + if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -1219,9 +1261,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, * Meta data starts here. * * A. Free pointer (if we cannot overwrite object on free) - * B. Tracking data for SLAB_STORE_USER - * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) - * D. Padding to reach required alignment boundary or at minimum + * B. Canary for SLAB_CANARY + * C. Tracking data for SLAB_STORE_USER + * D. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * E. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. 
* @@ -1239,6 +1282,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + off += sizeof(void *); + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); @@ -2082,8 +2128,16 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, * was delayed by KASAN quarantine, or it was returned to KFENCE. */ static __always_inline -bool slab_free_hook(struct kmem_cache *s, void *x, bool init) +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, bool canary) { + /* + * Postpone setting the inactive canary until the metadata + * has potentially been cleared at the end of this function. + */ + if (canary) { + check_canary(s, x, s->random_active); + } + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2116,7 +2170,14 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; memset((char *)kasan_reset_tag(x) + s->inuse, 0, s->size - s->inuse - rsize); + if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) + s->ctor(x); } + + if (canary) { + set_canary(s, x, s->random_inactive); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ return !kasan_slab_free(s, x, init); } @@ -2132,7 +2193,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); + slab_free_hook(s, next, false, false); return false; } @@ -2147,7 +2208,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init))) { + if (likely(slab_free_hook(s, object, init, true))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2159,6 +2220,22 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, * accordingly if object's reuse is delayed. */ --(*cnt); + + /* Objects that are put into quarantine by KASAN will + * still undergo free_consistency_checks(), which + * checks whether the freelist pointer is valid if it + * is located after the object (see check_object()). + * Since this is the case for slab caches with + * constructors, we need to fix the freelist pointer + * after init_on_free has overwritten it. + * + * Note that doing this for all caches (not just ctor + * ones) would cause a GPF due to KASAN poisoning and + * the way set_freepointer() eventually dereferences + * the freepointer. 
+ */ + if (slab_want_init_on_free(s) && s->ctor) + set_freepointer(s, object, NULL); } } while (object != old_tail); @@ -2168,8 +2245,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, static void *setup_object(struct kmem_cache *s, void *object) { setup_object_debug(s, object); + set_canary(s, object, s->random_inactive); object = kasan_init_slab_obj(s, object); - if (unlikely(s->ctor)) { + if (unlikely(s->ctor) && !has_sanitize_verify(s)) { kasan_unpoison_new_object(s, object); s->ctor(object); kasan_poison_new_object(s, object); @@ -3814,6 +3892,8 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); + if (p[i] && init && s->ctor) + s->ctor(p[i]); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); @@ -3850,7 +3930,24 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); - init = slab_want_init_on_alloc(gfpflags, s); + + if (has_sanitize_verify(s) && object) { + /* KASAN hasn't unpoisoned the object yet (this is done in the + * post-alloc hook), so let's do it temporarily. + */ + kasan_unpoison_new_object(s, object); + BUG_ON(memchr_inv(object, 0, s->object_size)); + if (s->ctor) + s->ctor(object); + kasan_poison_new_object(s, object); + } else { + init = slab_want_init_on_alloc(gfpflags, s); + } + + if (object) { + check_canary(s, object, s->random_inactive); + set_canary(s, object, s->random_active); + } out: /* @@ -4294,9 +4391,14 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { + bool canary = true; + memcg_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + /* Make sure canaries are not used on kfence objects. */ + if (is_kfence_address(object)) + canary = false; + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), canary))) do_slab_free(s, slab, object, object, 1, addr); } @@ -4325,8 +4427,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) struct slab *slab; slab = virt_to_slab(obj); +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG_ON(!slab); +#else if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; +#endif return slab->slab_cache; } @@ -4339,10 +4445,15 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) + if (cachep && cachep != s) { +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG(); +#else + WARN(1, "%s: Wrong slab cache. 
%s but object is from %s\n", + __func__, s->name, cachep->name); print_tracking(cachep, x); +#endif + } return cachep; } @@ -4368,8 +4479,12 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION + BUG_ON(order == 0); +#else if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); +#endif kmemleak_free(object); kasan_kfree_large(object); @@ -4536,7 +4651,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, { struct kmem_cache_cpu *c; unsigned long irqflags; - int i; + int i, k; /* * Drain objects in the per cpu slab, while disabling local @@ -4592,6 +4707,28 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); slub_put_cpu_ptr(s->cpu_slab); + if (has_sanitize_verify(s)) { + int j; + + for (j = 0; j < i; j++) { + /* KASAN hasn't unpoisoned the object yet (this is done in the + * post-alloc hook), so let's do it temporarily. + */ + kasan_unpoison_new_object(s, p[j]); + BUG_ON(memchr_inv(p[j], 0, s->object_size)); + if (s->ctor) + s->ctor(p[j]); + kasan_poison_new_object(s, p[j]); + } + } + + for (k = 0; k < i; k++) { + if (!is_kfence_address(p[k])) { + check_canary(s, p[k], s->random_inactive); + set_canary(s, p[k], s->random_active); + } + } + return i; error: @@ -4636,6 +4773,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, { int i; struct obj_cgroup *objcg = NULL; + bool init = false; if (!size) return 0; @@ -4652,8 +4790,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * Done outside of the IRQ disabled fastpath loop. */ if (likely(i != 0)) { + if (!has_sanitize_verify(s)) { + init = slab_want_init_on_alloc(flags, s); + } slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); + init, s->object_size); } else { memcg_slab_alloc_error_hook(s, size, objcg); } @@ -4682,10 +4823,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * and increases the number of allocations possible without having to * take the list_lock. */ -static unsigned int slub_min_order; -static unsigned int slub_max_order = +static unsigned int slub_min_order __ro_after_init; +static unsigned int slub_max_order __ro_after_init = IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; -static unsigned int slub_min_objects; +static unsigned int slub_min_objects __ro_after_init; /* * Calculate the order of allocation given a slab object size.
@@ -4869,6 +5010,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif + set_canary(kmem_cache_node, n, kmem_cache_node->random_active); n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -5035,6 +5177,9 @@ static int calculate_sizes(struct kmem_cache *s) s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } + if (IS_ENABLED(CONFIG_SLAB_CANARY)) + size += sizeof(void *); + #ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_STORE_USER) { /* @@ -5108,6 +5253,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif +#ifdef CONFIG_SLAB_CANARY + s->random_active = get_random_long(); + s->random_inactive = get_random_long(); +#endif if (!calculate_sizes(s)) goto error; @@ -5377,6 +5526,9 @@ void __check_heap_object(const void *ptr, unsigned long n, offset -= s->red_left_pad; } + if (!is_kfence) + check_canary(s, (void *)ptr - offset, s->random_active); + /* Allow address range falling entirely within usercopy region. */ if (offset >= s->useroffset && offset - s->useroffset <= s->usersize && diff --git a/mm/util.c b/mm/util.c index 5a6a9802583b..70890d727b88 100644 --- a/mm/util.c +++ b/mm/util.c @@ -372,9 +372,9 @@ unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) - return randomize_page(mm->brk, SZ_32M); + return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - return randomize_page(mm->brk, SZ_1G); + return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; } unsigned long arch_mmap_rnd(void) diff --git a/net/core/dev.c b/net/core/dev.c index c9b8412f1c9d..6aaa681d2731 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5122,7 +5122,7 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); -static __latent_entropy void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6741,7 +6741,7 @@ static int napi_threaded_poll(void *data) return 0; } -static __latent_entropy void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 50dc9970cad2..25398c88d164 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -267,6 +267,7 @@ config IP_PIMSM_V2 config SYN_COOKIES bool "IP: TCP syncookie support" + default y help Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote @@ -771,3 +772,26 @@ config TCP_MD5SIG on the Internet. If unsure, say N. + +config TCP_SIMULT_CONNECT_DEFAULT_ON + bool "Enable TCP simultaneous connect" + help + Enable TCP simultaneous connect that adds a weakness in Linux's strict + implementation of TCP that allows two clients to connect to each other + without either entering a listening state. The weakness allows an + attacker to easily prevent a client from connecting to a known server + provided the source port for the connection is guessed correctly. + + As the weakness could be used to prevent an antivirus or IPS from + fetching updates, or prevent an SSL gateway from fetching a CRL, it + should be eliminated by disabling this option. 
Though Linux is one of + few operating systems supporting simultaneous connect, it has no + legitimate use in practice and is rarely supported by firewalls. + + Disabling this may break TCP STUNT which is used by some applications + for NAT traversal. + + This setting can be overridden at runtime via the + net.ipv4.tcp_simult_connect sysctl. + + If unsure, say N. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dcc1..afc8bf021413 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -575,6 +575,15 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, + { + .procname = "tcp_simult_connect", + .data = &sysctl_tcp_simult_connect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8415aa41524e..d8557992a923 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -82,6 +82,7 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; +int sysctl_tcp_simult_connect __read_mostly = IS_ENABLED(CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON); #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -6524,7 +6525,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard_and_undo; } - if (th->syn) { + if (th->syn && sysctl_tcp_simult_connect) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 739402f45509..0e99fabc460a 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -45,6 +45,7 @@ modpost-args = \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ + $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ $(if $(KBUILD_MODPOST_WARN),-w) \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index e383cda05367..929997fb5e45 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -39,6 +39,11 @@ config GCC_PLUGIN_LATENT_ENTROPY is some slowdown of the boot process (about 0.5%) and fork and irq processing. + When extra_latent_entropy is passed on the kernel command line, + entropy will be extracted from up to the first 4GB of RAM while the + runtime memory allocator is being initialized. This costs even more + slowdown of the boot process. + Note that entropy extracted this way is not cryptographically secure! 
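For readers who want to see the extra_latent_entropy mixing in isolation: the fold implemented in the mm/page_alloc.c hunk above is just an XOR-add over every word of a page. The following stand-alone user-space sketch reproduces that fold; PAGE_SIZE, the buffer, and the demo value are assumptions for illustration, and this is not kernel code:

    /* Stand-alone sketch of the XOR-add fold used by
     * __gather_extra_latent_entropy() in the mm/page_alloc.c hunk above.
     * Illustration only; PAGE_SIZE and the demo buffer are assumed.
     */
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    static unsigned long fold_pages(const unsigned long *data, size_t nr_pages)
    {
        unsigned long hash = 0;
        size_t index, end = PAGE_SIZE * nr_pages / sizeof(hash);

        /* Same fold as the kernel hunk: hash ^= hash + word */
        for (index = 0; index < end; index++)
            hash ^= hash + data[index];
        return hash;
    }

    int main(void)
    {
        size_t words = PAGE_SIZE / sizeof(unsigned long);
        unsigned long *buf = calloc(words, sizeof(unsigned long));

        if (!buf)
            return 1;
        buf[0] = 0xdeadbeef; /* pretend the page held boot-time data */
        printf("folded hash: %lx\n", fold_pages(buf, 1));
        free(buf);
        return 0;
    }
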
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index ce686ebf5591..0274a4e617d8 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -38,6 +38,8 @@ static bool sec_mismatch_warn_only = true; /* Trim EXPORT_SYMBOLs that are unused by in-tree modules */ static bool trim_unused_exports; +static int writable_fptr_count = 0; +static bool writable_fptr_verbose = false; /* ignore missing files */ static bool ignore_missing_files; /* If set to 1, only warn (instead of error) about missing ns imports */ @@ -813,6 +815,7 @@ enum mismatch { ANY_INIT_TO_ANY_EXIT, ANY_EXIT_TO_ANY_INIT, EXTABLE_TO_NON_TEXT, + DATA_TO_TEXT }; /** @@ -875,6 +878,12 @@ static const struct sectioncheck sectioncheck[] = { .bad_tosec = { ".altinstr_replacement", NULL }, .good_tosec = {ALL_TEXT_SECTIONS , NULL}, .mismatch = EXTABLE_TO_NON_TEXT, +}, +/* Do not reference code from writable data */ +{ + .fromsec = { DATA_SECTIONS, NULL }, + .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, + .mismatch = DATA_TO_TEXT } }; @@ -1050,7 +1059,13 @@ static void default_mismatch_handler(const char *modname, struct elf_info *elf, if (!secref_whitelist(fromsec, fromsym, tosec, tosym)) return; - sec_mismatch_count++; + if (mismatch->mismatch == DATA_TO_TEXT) { + writable_fptr_count++; + if (!writable_fptr_verbose) + return; + } else { + sec_mismatch_count++; + } warn("%s: section mismatch in reference: %s+0x%x (section: %s) -> %s (section: %s)\n", modname, fromsym, @@ -1077,6 +1092,11 @@ static void default_mismatch_handler(const char *modname, struct elf_info *elf, else error("%s+0x%lx references non-executable section '%s'\n", fromsec, (long)faddr, tosec); + } else if (mismatch->mismatch == DATA_TO_TEXT) { + fprintf(stderr, + "The %s:%s references\n" + "the %s:%s\n", + fromsec, fromsym, tosec, tosym); } } @@ -2193,7 +2213,7 @@ int main(int argc, char **argv) LIST_HEAD(dump_lists); struct dump_list *dl, *dl2; - while ((opt = getopt(argc, argv, "ei:MmnT:to:au:WwENd:")) != -1) { + while ((opt = getopt(argc, argv, "ei:fMmnT:to:au:WwENd:")) != -1) { switch (opt) { case 'e': external_module = true; @@ -2203,6 +2223,9 @@ int main(int argc, char **argv) dl->file = optarg; list_add_tail(&dl->list, &dump_lists); break; + case 'f': + writable_fptr_verbose = true; + break; case 'M': module_enabled = true; break; @@ -2293,5 +2316,11 @@ int main(int argc, char **argv) warn("suppressed %u unresolved symbol warnings because there were too many)\n", nr_unresolved - MAX_UNRESOLVED_REPORTS); + if (writable_fptr_count && !writable_fptr_verbose) + warn("modpost: Found %d writable function pointer%s.\n" + "To see full details build your kernel with:\n" + "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", + writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); + return error_occurred ? 1 : 0; } diff --git a/security/Kconfig b/security/Kconfig index 52c9af08ad35..889378df2713 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -9,7 +9,7 @@ source "security/keys/Kconfig" config SECURITY_DMESG_RESTRICT bool "Restrict unprivileged access to the kernel syslog" - default n + default y help This enforces restrictions on unprivileged users reading the kernel syslog via dmesg(8). @@ -19,10 +19,34 @@ If you are unsure how to answer this question, answer N.
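Since SECURITY_DMESG_RESTRICT now defaults to y, an easy way to observe its effect is to attempt a syslog read as an unprivileged user; with kernel.dmesg_restrict=1 the call below should fail with EPERM. This is a hypothetical user-space probe for illustration, not part of the patch:

    /* User-space probe: with kernel.dmesg_restrict=1 an unprivileged
     * caller lacking CAP_SYSLOG should get EPERM here. Illustration only.
     */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/klog.h>

    #define SYSLOG_ACTION_READ_ALL 3 /* glibc does not name this constant */

    int main(void)
    {
        char buf[4096];
        int n = klogctl(SYSLOG_ACTION_READ_ALL, buf, sizeof(buf));

        if (n < 0)
            printf("kernel log read denied: %s\n", strerror(errno));
        else
            printf("read %d bytes of kernel log\n", n);
        return 0;
    }
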
+config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + default y + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. + +config SECURITY_TIOCSTI_RESTRICT + bool "Restrict unprivileged use of tiocsti command injection" + default y + help + This enforces restrictions on unprivileged users injecting commands + into other processes which share a tty session using the TIOCSTI + ioctl. This option makes using TIOCSTI require CAP_SYS_ADMIN. + + If this option is not selected, no restrictions will be enforced + unless the tiocsti_restrict sysctl is explicitly set to 1. + + If you are unsure how to answer this question, answer N. + config SECURITY bool "Enable different security models" depends on SYSFS depends on MULTIUSER + default y help This allows you to choose different security modules to be configured into your kernel. @@ -43,6 +67,7 @@ config SECURITYFS config SECURITY_NETWORK bool "Socket and Networking Security Hooks" depends on SECURITY + default y help This enables the socket and networking security hooks. If enabled, a security module can use these hooks to @@ -130,6 +155,7 @@ config LSM_MMAP_MIN_ADDR config HARDENED_USERCOPY bool "Harden memory copies between kernel and userspace" imply STRICT_DEVMEM + default y help This option checks for obviously wrong memory regions when copying memory to/from the kernel (via copy_to_user() and @@ -146,6 +172,7 @@ config FORTIFY_SOURCE depends on !CC_IS_CLANG || CLANG_VERSION >= 120001 # https://github.com/llvm/llvm-project/issues/53645 depends on !CC_IS_CLANG || !X86_32 + default y help Detect overflows of buffers in common string and memory functions where the compiler can determine and validate the buffer sizes. diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index 2cff851ebfd7..29552add6301 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -226,6 +226,7 @@ config STACKLEAK_RUNTIME_DISABLE config INIT_ON_ALLOC_DEFAULT_ON bool "Enable heap memory zeroing on allocation by default" + default y depends on !KMSAN help This has the effect of setting "init_on_alloc=1" on the kernel @@ -239,6 +240,7 @@ config INIT_ON_ALLOC_DEFAULT_ON config INIT_ON_FREE_DEFAULT_ON bool "Enable heap memory zeroing on free by default" + default y depends on !KMSAN help This has the effect of setting "init_on_free=1" on the kernel @@ -277,12 +279,28 @@ config ZERO_CALL_USED_REGS be evaluated for suitability. For example, x86_64 grows by less than 1%, and arm64 grows by about 5%. +config PAGE_SANITIZE_VERIFY + bool "Verify sanitized pages" + default y + help + When init_on_free is enabled, verify that newly allocated pages + are zeroed to detect write-after-free bugs. + +config SLAB_SANITIZE_VERIFY + bool "Verify sanitized SLAB allocations" + default y + depends on !KASAN + help + When init_on_free is enabled, verify that newly allocated slab + objects are zeroed to detect write-after-free bugs.
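The PAGE_SANITIZE_VERIFY option above pairs with the post_alloc_hook() hunk in mm/page_alloc.c, which calls verify_zero_highpage() on every page of an allocation when init_on_free is active. The helper itself is added to include/linux/highmem.h by this patch, but that hunk is not shown in this excerpt; a plausible shape for it, assuming it simply maps the page and BUGs on any surviving non-zero byte, is:

    /* Hedged reconstruction of the verify_zero_highpage() helper this
     * patch adds to include/linux/highmem.h (hunk not shown here). The
     * real implementation may differ; this only captures the intent:
     * map the page, BUG if init_on_free left any byte non-zero, unmap.
     */
    #include <linux/bug.h>
    #include <linux/highmem.h>
    #include <linux/string.h>

    static inline void verify_zero_highpage(struct page *page)
    {
        void *kaddr = kmap_local_page(page);

        BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE));
        kunmap_local(kaddr);
    }
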
+ endmenu menu "Hardening of kernel data structures" config LIST_HARDENED bool "Check integrity of linked list manipulation" + default y help Minimal integrity checking in the linked-list manipulation routines to catch memory corruptions that are not guaranteed to result in an @@ -293,6 +311,7 @@ config LIST_HARDENED config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select LIST_HARDENED + default y help Select this option if the kernel should BUG when it encounters data corruption in kernel memory structures when they get checked diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index 61abc1e094a8..ceda05c0c4f6 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -3,7 +3,7 @@ config SECURITY_SELINUX bool "SELinux Support" depends on SECURITY_NETWORK && AUDIT && NET && INET select NETWORK_SECMARK - default n + default y help This selects Security-Enhanced Linux (SELinux). You will also need a policy configuration and a labeled filesystem. diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 71e6e7079d7f..97504870ea48 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -140,18 +140,6 @@ static int __init selinux_enabled_setup(char *str) __setup("selinux=", selinux_enabled_setup); #endif -static int __init checkreqprot_setup(char *str) -{ - unsigned long checkreqprot; - - if (!kstrtoul(str, 0, &checkreqprot)) { - if (checkreqprot) - pr_err("SELinux: checkreqprot set to 1 via kernel parameter. This is no longer supported.\n"); - } - return 1; -} -__setup("checkreqprot=", checkreqprot_setup); - /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 074d6c2714eb..92b7d8c310d9 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -697,20 +697,12 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - if (sscanf(page, "%u", &new_value) != 1) { + if (sscanf(page, "%u", &new_value) != 1 || new_value) { length = -EINVAL; goto out; } length = count; - if (new_value) { - char comm[sizeof(current->comm)]; - - memcpy(comm, current->comm, sizeof(comm)); - pr_err("SELinux: %s (%d) set checkreqprot to 1. This is no longer supported.\n", - comm, current->pid); - } - selinux_ima_measure_state(); out: diff --git a/security/yama/Kconfig b/security/yama/Kconfig index a810304123ca..b809050b25d2 100644 --- a/security/yama/Kconfig +++ b/security/yama/Kconfig @@ -2,7 +2,7 @@ config SECURITY_YAMA bool "Yama support" depends on SECURITY - default n + default y help This selects Yama, which extends DAC support with additional system-wide security settings beyond regular Linux discretionary diff --git a/tools/perf/Documentation/security.txt b/tools/perf/Documentation/security.txt index 4fe3b8b1958f..a7d88cc23a70 100644 --- a/tools/perf/Documentation/security.txt +++ b/tools/perf/Documentation/security.txt @@ -148,6 +148,7 @@ Perf tool provides a message similar to the one below: >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling + >= 3: Disallow use of any event To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. 
kernel.perf_event_paranoid = <setting>) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 727dae445da9..95172e7a1b9a 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2990,6 +2990,7 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, ">= 0: Disallow raw and ftrace function tracepoint access\n" ">= 1: Disallow CPU event access\n" ">= 2: Disallow kernel profiling\n" + ">= 3: Disallow use of any event\n" "To make the adjusted perf_event_paranoid setting permanent preserve it\n" "in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)", perf_event_paranoid()); -- 2.44.0
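Closing usage note: the TIOCSTI restriction configured by SECURITY_TIOCSTI_RESTRICT above can be exercised from user space. With the tiocsti_restrict sysctl set to 1, the probe below should fail with EPERM for a process lacking CAP_SYS_ADMIN; with it set to 0, injecting into one's own tty still succeeds. This is an illustrative test program, not part of the patch:

    /* Illustrative probe for the TIOCSTI restriction: inject one character
     * into our own tty input queue. With kernel.tiocsti_restrict=1 and no
     * CAP_SYS_ADMIN this ioctl should fail with EPERM. Not part of the patch.
     */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
        char c = '#';
        int fd = open("/dev/tty", O_RDWR);

        if (fd < 0) {
            perror("open /dev/tty");
            return 1;
        }
        if (ioctl(fd, TIOCSTI, &c) < 0)
            printf("TIOCSTI rejected: %s\n", strerror(errno));
        else
            printf("TIOCSTI injected '%c' into our own input queue\n", c);
        close(fd);
        return 0;
    }
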