269 files changed, 3920 insertions, 1277 deletions
diff --git a/Documentation/kbuild/00-INDEX b/Documentation/kbuild/00-INDEX index 8c5e6aa78004..b367200c94b5 100644 --- a/Documentation/kbuild/00-INDEX +++ b/Documentation/kbuild/00-INDEX @@ -1,5 +1,7 @@ 00-INDEX - this file: info on the kernel build process +llvm.txt + - how to compile Linux with llvm headers_install.txt - how to export Linux headers for use by userspace kbuild.txt diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt index 0ff6a466a05b..3e63c2361291 100644 --- a/Documentation/kbuild/kbuild.txt +++ b/Documentation/kbuild/kbuild.txt @@ -238,3 +238,14 @@ KBUILD_VMLINUX_MAIN All object files for the main part of vmlinux. KBUILD_VMLINUX_INIT and KBUILD_VMLINUX_MAIN together specify all the object files used to link vmlinux. + +KBUILD_VMLINUX_LIBS +-------------------------------------------------- +All .a "lib" files for vmlinux. +KBUILD_VMLINUX_INIT, KBUILD_VMLINUX_MAIN, and KBUILD_VMLINUX_LIBS together +specify all the object files used to link vmlinux. + +LLVM +---- +If this variable is set to 1, Kbuild will use Clang and LLVM utilities instead +of GCC and GNU binutils to build the kernel. diff --git a/Documentation/kbuild/llvm.txt b/Documentation/kbuild/llvm.txt new file mode 100644 index 000000000000..51c009baa35a --- /dev/null +++ b/Documentation/kbuild/llvm.txt @@ -0,0 +1,86 @@ +Building Linux with Clang/LLVM +============================== + +This document covers how to build the Linux kernel with Clang and LLVM +utilities. + +About +----- + +The Linux kernel has traditionally been compiled with GNU +toolchains such as GCC and binutils. Ongoing work has allowed for +[Clang](https://clang.llvm.org/) and [LLVM](https://llvm.org/) utilities +to be used as viable substitutes. Distributions such as +[Android](https://www.android.com/), +[ChromeOS](https://www.chromium.org/chromium-os), and +[OpenMandriva](https://www.openmandriva.org/) use Clang-built kernels. +[LLVM is a collection of toolchain components implemented in terms of +C++ objects](https://www.aosabook.org/en/llvm.html). Clang is a +front-end to LLVM that supports C and the GNU C extensions required by +the kernel, and is pronounced "klang," not "see-lang." + +Clang +----- + +The compiler used can be swapped out via the `CC=` command line +argument to `make`. `CC=` should be set when +selecting a config and during a build. + +> make CC=clang defconfig +> +> make CC=clang + +Cross Compiling +--------------- + +A single Clang compiler binary will typically contain all supported +backends, which can help simplify cross compiling. + +> ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang + +`CROSS_COMPILE` is not used to prefix the Clang compiler +binary; instead, `CROSS_COMPILE` is used to set a command +line flag: `--target <triple>`. For example: + +> clang --target aarch64-linux-gnu foo.c + +LLVM Utilities +-------------- + +LLVM has substitutes for GNU binutils utilities. These can be invoked as +additional parameters to `make`. + +> make CC=clang LD=ld.lld AR=llvm-ar NM=llvm-nm STRIP=llvm-strip \ +> OBJCOPY=llvm-objcopy OBJDUMP=llvm-objdump OBJSIZE=llvm-size \ +> READELF=llvm-readelf HOSTCC=clang HOSTCXX=clang++ HOSTAR=llvm-ar \ +> HOSTLD=ld.lld + +Currently, the integrated assembler is disabled by default. You can pass +`LLVM_IAS=1` to enable it.
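The LLVM entry added to kbuild.txt above also provides a single switch: setting LLVM=1 makes Kbuild pick the Clang/LLVM tools by default (see the ifneq ($(LLVM),) hunks added to the top-level Makefile below), so the long command line above can usually be shortened. A usage sketch, assuming this backport covers the utilities your build actually needs:

> make LLVM=1 defconfig
>
> make LLVM=1
>
> make LLVM=1 LLVM_IAS=1    # additionally use Clang's integrated assembler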
+ +Getting Help +------------ + +- [Website](https://clangbuiltlinux.github.io/) +- [Mailing + List](https://groups.google.com/forum/#!forum/clang-built-linux): + <clang-built-linux@googlegroups.com> +- [Issue Tracker](https://github.com/ClangBuiltLinux/linux/issues) +- IRC: \#clangbuiltlinux on chat.freenode.net +- [Telegram](https://t.me/ClangBuiltLinux): \@ClangBuiltLinux +- [Wiki](https://github.com/ClangBuiltLinux/linux/wiki) +- [Beginner + Bugs](https://github.com/ClangBuiltLinux/linux/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) + +Getting LLVM +------------ + +- <http://releases.llvm.org/download.html> +- <https://github.com/llvm/llvm-project> +- <https://llvm.org/docs/GettingStarted.html> +- <https://llvm.org/docs/CMake.html> +- <https://apt.llvm.org/> +- <https://www.archlinux.org/packages/extra/x86_64/llvm/> +- <https://github.com/ClangBuiltLinux/tc-build> +- <https://github.com/ClangBuiltLinux/linux/wiki/Building-Clang-from-source> +- <https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/> diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 64c84e2d06c0..fba7bed6193a 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -41,6 +41,7 @@ This document describes the Linux kernel Makefiles. --- 6.8 Custom kbuild commands --- 6.9 Preprocessing linker scripts --- 6.10 Generic header files + --- 6.11 Post-link pass === 7 Kbuild syntax for exported headers --- 7.1 no-export-headers-y @@ -1251,6 +1252,21 @@ When kbuild executes, the following steps are followed (roughly): For example, powerpc uses this to check relocation sanity of the linked vmlinux file. +--- 6.11 Post-link pass + + If the file arch/xxx/Makefile.postlink exists, this makefile + will be invoked for post-link objects (vmlinux and modules.ko) + for architectures to run post-link passes on. Must also handle + the clean target. + + This pass runs after kallsyms generation. If the architecture + needs to modify symbol locations, rather than manipulate the + kallsyms, it may be easier to add another postlink target for + .tmp_vmlinux? targets to be called from link-vmlinux.sh. + + For example, powerpc uses this to check relocation sanity of + the linked vmlinux file. + === 7 Kbuild syntax for exported headers The kernel includes a set of headers that is exported to userspace. @@ -301,8 +301,14 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) +ifneq ($(LLVM),) +HOSTCC = clang +HOSTCXX = clang++ +else HOSTCC = gcc HOSTCXX = g++ +endif + HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 @@ -337,15 +343,27 @@ scripts/Kbuild.include: ; include scripts/Kbuild.include # Make variables (CC, etc...) 
+CPP = $(CC) -E +ifneq ($(LLVM),) +CC = clang +LD = ld.lld +AR = llvm-ar +NM = llvm-nm +OBJCOPY = llvm-objcopy +OBJDUMP = llvm-objdump +READELF = llvm-readelf +OBJSIZE = llvm-size +STRIP = llvm-strip +else AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld CC = $(CROSS_COMPILE)gcc -CPP = $(CC) -E AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy OBJDUMP = $(CROSS_COMPILE)objdump +endif AWK = awk GENKSYMS = scripts/genksyms/genksyms INSTALLKERNEL := installkernel @@ -400,6 +418,7 @@ KBUILD_AFLAGS := -D__ASSEMBLY__ $(call cc-option,-fno-PIE) KBUILD_AFLAGS_MODULE := -DMODULE KBUILD_CFLAGS_MODULE := -DMODULE KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds +LDFLAGS := CLANG_FLAGS := # Read KERNELRELEASE from include/config/kernel.release (if it exists) @@ -622,7 +641,9 @@ endif ifneq ($(GCC_TOOLCHAIN),) CLANG_FLAGS += --gcc-toolchain=$(GCC_TOOLCHAIN) endif +ifneq ($(LLVM_IAS),1) CLANG_FLAGS += -no-integrated-as +endif CLANG_FLAGS += -Werror=unknown-warning-option CLANG_FLAGS += $(call cc-option, -Wno-misleading-indentation) CLANG_FLAGS += $(call cc-option, -Wno-bool-operation) @@ -646,6 +667,11 @@ KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias) +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +KBUILD_CFLAGS += $(call cc-option,-ffunction-sections,) +KBUILD_CFLAGS += $(call cc-option,-fdata-sections,) +endif + ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os else @@ -772,8 +798,11 @@ KBUILD_CFLAGS += $(call cc-option, -gsplit-dwarf, -g) else KBUILD_CFLAGS += -g endif +ifneq ($(LLVM_IAS),1) KBUILD_AFLAGS += -Wa,-gdwarf-2 endif +endif + ifdef CONFIG_DEBUG_INFO_DWARF4 KBUILD_CFLAGS += $(call cc-option, -gdwarf-4,) endif @@ -883,6 +912,10 @@ LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\ KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID) LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID) +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +LDFLAGS_vmlinux += $(call ld-option, --gc-sections,) +endif + ifeq ($(CONFIG_STRIP_ASM_SYMS),y) LDFLAGS_vmlinux += $(call ld-option, -X,) endif @@ -985,22 +1018,27 @@ core-y := $(patsubst %/, %/built-in.o, $(core-y)) drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y)) net-y := $(patsubst %/, %/built-in.o, $(net-y)) libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) -libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y)) -libs-y := $(libs-y1) $(libs-y2) +libs-y2 := $(filter-out %.a, $(patsubst %/, %/built-in.o, $(libs-y))) virt-y := $(patsubst %/, %/built-in.o, $(virt-y)) # Externally visible symbols (used by link-vmlinux.sh) export KBUILD_VMLINUX_INIT := $(head-y) $(init-y) -export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y) +export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y2) $(drivers-y) $(net-y) $(virt-y) +export KBUILD_VMLINUX_LIBS := $(libs-y1) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux # used by scripts/pacmage/Makefile export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools) -vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) +vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) $(KBUILD_VMLINUX_LIBS) + +ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink) # Final link of vmlinux - cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) + 
cmd_link-vmlinux = \ + $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux); \ + $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) + quiet_cmd_link-vmlinux = LINK $@ # Include targets which we want to @@ -1018,8 +1056,16 @@ endif ifdef CONFIG_GDB_SCRIPTS $(Q)ln -fsn `cd $(srctree) && /bin/pwd`/scripts/gdb/vmlinux-gdb.py endif +ifdef CONFIG_TRIM_UNUSED_KSYMS + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \ + "$(MAKE) KBUILD_MODULES=1 -f $(srctree)/Makefile vmlinux_prereq" +endif +$(call if_changed,link-vmlinux) +# standalone target for easier testing +include/generated/autoksyms.h: FORCE + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh true + # The actual objects are generated when descending, # make sure no implicit rule kicks in $(sort $(vmlinux-deps)): $(vmlinux-dirs) ; @@ -1295,6 +1341,7 @@ $(clean-dirs): vmlinuxclean: $(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean + $(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean) clean: archclean vmlinuxclean diff --git a/arch/Kconfig b/arch/Kconfig index a29c6098fb98..b8629dc08ae0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -432,6 +432,25 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES and similar) by implementing an inline arch_within_stack_frames(), which is used by CONFIG_HARDENED_USERCOPY. +config THIN_ARCHIVES + def_bool y + help + Select this if the architecture wants to use thin archives + instead of ld -r to create the built-in.o files. + +config LD_DEAD_CODE_DATA_ELIMINATION + bool + help + Select this if the architecture wants to do dead code and + data elimination with the linker by compiling with + -ffunction-sections -fdata-sections and linking with + --gc-sections. + + This requires that the arch annotates or otherwise protects + its external entry points from being discarded. Linker scripts + must also merge .text.*, .data.*, and .bss.* correctly into + output sections. 
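To make section 6.11 of makefiles.txt above concrete, here is a minimal sketch of an arch/xxx/Makefile.postlink, as invoked through the ARCH_POSTLINK hook added to the top-level Makefile above. It is loosely modeled on the powerpc relocation check that the documentation mentions; the architecture name and the relocs_check.sh helper are illustrative only and not part of this patch.

# arch/xxx/Makefile.postlink -- illustrative sketch only
PHONY := __archpost
__archpost:

-include include/config/auto.conf
include scripts/Kbuild.include

quiet_cmd_relocs_check = CHKREL  $@
      cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/xxx/tools/relocs_check.sh "$(OBJDUMP)" "$@"

# `@true` keeps make quiet when a target has nothing to do
vmlinux: FORCE
	@true
ifdef CONFIG_RELOCATABLE
	$(call if_changed,relocs_check)
endif

%.ko: FORCE
	@true

# also invoked with the "clean" target from the vmlinuxclean rule above
clean:
	@true

PHONY += FORCE clean
FORCE:

.PHONY: $(PHONY)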
+ config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/arm/boot/dts/qcom/msm8996-v3.dtsi b/arch/arm/boot/dts/qcom/msm8996-v3.dtsi index 711e0c259c6a..aaf6b4250459 100644 --- a/arch/arm/boot/dts/qcom/msm8996-v3.dtsi +++ b/arch/arm/boot/dts/qcom/msm8996-v3.dtsi @@ -941,7 +941,6 @@ }; qcom,msm-thermal { msm_thermal_freq: qcom,vdd-apps-rstr{ - qcom,max-freq-level = <1190400>; qcom,levels = <1036800 1555200 1555200>; }; qcom,vdd-gfx-rstr{ diff --git a/arch/arm/boot/dts/qcom/msm8996pro.dtsi b/arch/arm/boot/dts/qcom/msm8996pro.dtsi index f140e78a16d7..d0949d70c3ca 100644 --- a/arch/arm/boot/dts/qcom/msm8996pro.dtsi +++ b/arch/arm/boot/dts/qcom/msm8996pro.dtsi @@ -1332,7 +1332,6 @@ qcom,limit-temp = <80>; qcom,core-limit-temp = <90>; msm_thermal_freq: qcom,vdd-apps-rstr { - qcom,max-freq-level = <1209600>; qcom,levels = <1056000 1516800 1516800>; }; qcom,vdd-gfx-rstr{ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 95a000515e43..7d656525a671 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index f4e9585226f3..fe0fb38df196 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -405,3 +405,4 @@ 388 common userfaultfd sys_userfaultfd 389 common membarrier sys_membarrier 390 common mlock2 sys_mlock2 +434 common pidfd_open sys_pidfd_open diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8b055133ae2a..28f09a00143b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -458,18 +458,13 @@ config ARM64_ERRATUM_845719 config ARM64_ERRATUM_843419 bool "Cortex-A53: 843419: A load or store might access an incorrect address" - depends on MODULES default y - select ARM64_MODULE_CMODEL_LARGE + select ARM64_MODULE_CMODEL_LARGE if MODULES help - This option builds kernel modules using the large memory model in - order to avoid the use of the ADRP instruction, which can cause - a subsequent memory access to use an incorrect address on Cortex-A53 - parts up to r0p4. - - Note that the kernel itself must be linked with a version of ld - which fixes potentially affected ADRP instructions through the - use of veneers. + This option links the kernel with '--fix-cortex-a53-843419' and + builds modules using the large memory model in order to avoid the use + of the ADRP instruction, which can cause a subsequent memory access + to use an incorrect address on Cortex-A53 parts up to r0p4. If unsure, say Y. diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4df2b48fae44..adc88e707c09 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -10,13 +10,34 @@ # # Copyright (C) 1995-2001 by Russell King -LDFLAGS_vmlinux :=--no-undefined -X +LDFLAGS_vmlinux :=--no-undefined -X -z norelro CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 -ifneq ($(CONFIG_RELOCATABLE),) -LDFLAGS_vmlinux += -pie -Bsymbolic +ifeq ($(CONFIG_RELOCATABLE), y) +# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour +# for relative relocs, since this leads to better Image compression +# with the relocation offsets always being zero. 
+LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \ + $(call ld-option, --no-apply-dynamic-relocs) +endif + +ifeq ($(CONFIG_ARM64_ERRATUM_843419),y) + ifeq ($(call ld-option, --fix-cortex-a53-843419),) +$(warning ld does not support --fix-cortex-a53-843419; kernel may be susceptible to erratum) + else + ifeq ($(call gold-ifversion, -lt, 114000000, y), y) +$(warning This version of GNU gold may generate incorrect code with --fix-cortex-a53-843419;\ + see https://sourceware.org/bugzilla/show_bug.cgi?id=21491) + endif +LDFLAGS_vmlinux += --fix-cortex-a53-843419 + endif +else + ifeq ($(ld-name),gold) +# Pass --no-fix-cortex-a53-843419 to ensure the erratum fix is disabled +LDFLAGS += --no-fix-cortex-a53-843419 + endif endif KBUILD_DEFCONFIG := defconfig @@ -75,11 +96,15 @@ KBUILD_AFLAGS += $(lseinstr) $(vdso32) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian AS += -EB -LD += -EB +# Prefer the baremetal ELF build target, but not all toolchains include +LDFLAGS += -EB $(call ld-option, -maarch64elfb, -maarch64linuxb) +UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian AS += -EL -LD += -EL +# Same as above, prefer ELF but fall back to linux target if needed. +LDFLAGS += -EL $(call ld-option, -maarch64elf, -maarch64linux) +UTS_MACHINE := aarch64 endif CHECKFLAGS += -D__aarch64__ diff --git a/arch/arm64/configs/msmcortex_defconfig b/arch/arm64/configs/msmcortex_defconfig index 478140987c2d..a205cdab4c6f 100644 --- a/arch/arm64/configs/msmcortex_defconfig +++ b/arch/arm64/configs/msmcortex_defconfig @@ -703,11 +703,11 @@ CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_IPC_LOGGING=y CONFIG_QCOM_RTB=y CONFIG_QCOM_RTB_SEPARATE_CPUS=y -CONFIG_FUNCTION_TRACER=y +#CONFIG_FUNCTION_TRACER=y CONFIG_PREEMPTIRQ_EVENTS=y -CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPT_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y +#CONFIG_IRQSOFF_TRACER=y +#CONFIG_PREEMPT_TRACER=y +#CONFIG_BLK_DEV_IO_TRACE=y CONFIG_CPU_FREQ_SWITCH_PROFILER=y CONFIG_LKDTM=y CONFIG_MEMTEST=y diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index f0a8f2475ea3..1071dfcdaf1b 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -21,7 +21,7 @@ obj-$(CONFIG_CRYPTO_POLY_HASH_ARM64_CE) += poly-hash-ce.o poly-hash-ce-y := poly-hash-ce-glue.o poly-hash-ce-core.o obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S new file mode 100644 index 000000000000..8efdfdade393 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-core.S @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .arch armv8-a+crypto + +ENTRY(__aes_ce_encrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aese v0.16b, v2.16b + aesmc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aese v0.16b, v1.16b + aesmc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aese v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +ENDPROC(__aes_ce_encrypt) + +ENTRY(__aes_ce_decrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aesd v0.16b, v2.16b + aesimc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aesd v0.16b, v3.16b + aesimc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aesd v0.16b, v1.16b + aesimc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aesd v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +ENDPROC(__aes_ce_decrypt) + +/* + * __aes_ce_sub() - use the aese instruction to perform the AES sbox + * substitution on each byte in 'input' + */ +ENTRY(__aes_ce_sub) + dup v1.4s, w0 + movi v0.16b, #0 + aese v0.16b, v1.16b + umov w0, v0.s[0] + ret +ENDPROC(__aes_ce_sub) + +ENTRY(__aes_ce_invert) + ld1 {v0.4s}, [x1] + aesimc v1.16b, v0.16b + st1 {v1.4s}, [x0] + ret +ENDPROC(__aes_ce_invert) diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c index 50d9fe11d0c8..60af7bddfbb9 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -24,6 +24,13 @@ struct aes_block { u8 b[AES_BLOCK_SIZE]; }; +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +asmlinkage u32 __aes_ce_sub(u32 l); +asmlinkage void __aes_ce_invert(struct aes_block *out, + const struct aes_block *in); + static int num_rounds(struct crypto_aes_ctx *ctx) { /* @@ -39,113 +46,21 @@ static int num_rounds(struct crypto_aes_ctx *ctx) static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct aes_block *out = (struct aes_block *)dst; - struct aes_block const *in = (struct aes_block *)src; - void *dummy0; - int dummy1; kernel_neon_begin_partial(4); - - __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" - " cmp %w[rounds], #10 ;" - " bmi 0f ;" - " bne 3f ;" - " mov v3.16b, v1.16b ;" - " b 2f ;" - "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" - "1: aese v0.16b, v2.16b ;" - " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" - " aese v0.16b, v3.16b ;" - " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" - " subs %w[rounds], %w[rounds], #3 ;" - " aese v0.16b, v1.16b ;" - " aesmc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" - " bpl 1b ;" - " aese v0.16b, v2.16b ;" - " eor v0.16b, v0.16b, v3.16b ;" - " st1 {v0.16b}, %[out] ;" - - : [out] "=Q"(*out), - [key] "=r"(dummy0), - [rounds] "=r"(dummy1) - : [in] "Q"(*in), - "1"(ctx->key_enc), - "2"(num_rounds(ctx) - 2) - : "cc"); - + __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); kernel_neon_end(); } static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct 
aes_block *out = (struct aes_block *)dst; - struct aes_block const *in = (struct aes_block *)src; - void *dummy0; - int dummy1; kernel_neon_begin_partial(4); - - __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" - " cmp %w[rounds], #10 ;" - " bmi 0f ;" - " bne 3f ;" - " mov v3.16b, v1.16b ;" - " b 2f ;" - "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" - "1: aesd v0.16b, v2.16b ;" - " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" - " aesd v0.16b, v3.16b ;" - " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" - " subs %w[rounds], %w[rounds], #3 ;" - " aesd v0.16b, v1.16b ;" - " aesimc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" - " bpl 1b ;" - " aesd v0.16b, v2.16b ;" - " eor v0.16b, v0.16b, v3.16b ;" - " st1 {v0.16b}, %[out] ;" - - : [out] "=Q"(*out), - [key] "=r"(dummy0), - [rounds] "=r"(dummy1) - : [in] "Q"(*in), - "1"(ctx->key_dec), - "2"(num_rounds(ctx) - 2) - : "cc"); - + __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); kernel_neon_end(); } -/* - * aes_sub() - use the aese instruction to perform the AES sbox substitution - * on each byte in 'input' - */ -static u32 aes_sub(u32 input) -{ - u32 ret; - - __asm__("dup v1.4s, %w[in] ;" - "movi v0.16b, #0 ;" - "aese v0.16b, v1.16b ;" - "umov %w[out], v0.4s[0] ;" - - : [out] "=r"(ret) - : [in] "r"(input) - : "v0","v1"); - - return ret; -} - int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len) { @@ -174,9 +89,9 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, u32 *rko = rki + kwords; #ifndef CONFIG_CPU_BIG_ENDIAN - rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; + rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; #else - rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ + rko[0] = rol32(__aes_ce_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ rki[0]; #endif rko[1] = rko[0] ^ rki[1]; @@ -191,7 +106,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) { if (i >= 6) break; - rko[4] = aes_sub(rko[3]) ^ rki[4]; + rko[4] = __aes_ce_sub(rko[3]) ^ rki[4]; rko[5] = rko[4] ^ rki[5]; rko[6] = rko[5] ^ rki[6]; rko[7] = rko[6] ^ rki[7]; @@ -210,13 +125,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_dec[0] = key_enc[j]; for (i = 1, j--; j > 0; i++, j--) - __asm__("ld1 {v0.16b}, %[in] ;" - "aesimc v1.16b, v0.16b ;" - "st1 {v1.16b}, %[out] ;" - - : [out] "=Q"(key_dec[i]) - : [in] "Q"(key_enc[j]) - : "v0","v1"); + __aes_ce_invert(key_dec + i, key_enc + j); key_dec[i] = key_enc[0]; kernel_neon_end(); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 838dad5c209f..067b4e222f74 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -297,17 +297,19 @@ AES_ENTRY(aes_ctr_encrypt) eor v1.16b, v1.16b, v3.16b st1 {v0.16b-v1.16b}, [x0], #32 #else - ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ - dup v7.4s, w8 + add w7, w6, #1 mov v0.16b, v4.16b - add v7.4s, v7.4s, v8.4s + add w8, w6, #2 mov v1.16b, v4.16b - rev32 v8.16b, v7.16b + add w9, w6, #3 mov v2.16b, v4.16b + rev w7, w7 mov v3.16b, v4.16b - mov v1.s[3], v8.s[0] - mov v2.s[3], v8.s[1] - mov v3.s[3], v8.s[2] + rev w8, w8 + mov v1.s[3], w7 + rev w9, w9 + mov v2.s[3], w8 + mov v3.s[3], w9 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ do_encrypt_block4x eor v0.16b, v5.16b, v0.16b diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 
7dcfd83ff5e8..5b7de95d6a48 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -329,21 +329,33 @@ lr .req x30 // link register * size: size of the region * Corrupts: kaddr, size, tmp1, tmp2 */ + .macro __dcache_op_workaround_clean_cache, op, kaddr +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \kaddr +alternative_else + dc civac, \kaddr +alternative_endif + .endm + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 dcache_line_size \tmp1, \tmp2 add \size, \kaddr, \size sub \tmp2, \tmp1, #1 bic \kaddr, \kaddr, \tmp2 9998: - .if (\op == cvau || \op == cvac) -alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE - dc \op, \kaddr -alternative_else - dc civac, \kaddr -alternative_endif + .ifc \op, cvau + __dcache_op_workaround_clean_cache \op, \kaddr + .else + .ifc \op, cvac + __dcache_op_workaround_clean_cache \op, \kaddr + .else + .ifc \op, cvap + sys 3, c7, c12, 1, \kaddr // dc cvap .else dc \op, \kaddr .endif + .endif + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b diff --git a/arch/arm64/include/asm/opcodes.h b/arch/arm64/include/asm/opcodes.h deleted file mode 100644 index 123f45d92cd1..000000000000 --- a/arch/arm64/include/asm/opcodes.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_CPU_BIG_ENDIAN -#define CONFIG_CPU_ENDIAN_BE8 CONFIG_CPU_BIG_ENDIAN -#endif - -#include <../../arm/include/asm/opcodes.h> diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 5af574d632fa..6a5b28904c33 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -15,21 +15,22 @@ #ifndef _ARM_PROBES_H #define _ARM_PROBES_H -#include <asm/opcodes.h> - -struct kprobe; -struct arch_specific_insn; - -typedef u32 kprobe_opcode_t; -typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *); +typedef u32 probe_opcode_t; +typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); /* architecture specific copy of original instruction */ -struct arch_specific_insn { - kprobe_opcode_t *insn; +struct arch_probe_insn { + probe_opcode_t *insn; pstate_check_t *pstate_cc; - kprobes_handler_t *handler; + probes_handler_t *handler; /* restore address after step xol */ unsigned long restore; }; +#ifdef CONFIG_KPROBES +typedef u32 kprobe_opcode_t; +struct arch_specific_insn { + struct arch_probe_insn api; +}; +#endif #endif diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c768daa084ca..6bc2f3d28b68 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -22,8 +22,6 @@ #include <linux/stringify.h> -#include <asm/opcodes.h> - /* * ARMv8 ARM reserves the following encoding for system registers: * (Ref: ARMv8 ARM, Section: "System instruction class encoding overview", @@ -37,6 +35,14 @@ #define sys_reg(op0, op1, crn, crm, op2) \ ((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5)) +#ifdef __ASSEMBLY__ +// The space separator is omitted so that __emit_inst(x) can be parsed as +// either an assembler directive or an assembler macro argument. 
+#define __emit_inst(x) .inst(x) +#else +#define __emit_inst(x) ".inst " __stringify((x)) "\n\t" +#endif + #define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0) #define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5) #define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6) @@ -81,10 +87,10 @@ #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) #define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) -#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ - (!!x)<<8 | 0x1f) -#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ - (!!x)<<8 | 0x1f) +#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM | \ + (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM | \ + (!!x)<<8 | 0x1f) /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_EE (1 << 25) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 41e58fe3c041..63469acc5992 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5) -#define __NR_compat_syscalls 390 +#define __NR_compat_syscalls 435 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 5b925b761a2a..65e8559fe4aa 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -801,6 +801,8 @@ __SYSCALL(__NR_execveat, compat_sys_execveat) __SYSCALL(__NR_userfaultfd, sys_userfaultfd) #define __NR_membarrier 389 __SYSCALL(__NR_membarrier, sys_membarrier) +#define __NR_pidfd_open 434 +__SYSCALL(__NR_pidfd_open, sys_pidfd_open) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/uapi/asm/setup.h b/arch/arm64/include/uapi/asm/setup.h index 9cf2e46fbbdf..7474c813d5ab 100644 --- a/arch/arm64/include/uapi/asm/setup.h +++ b/arch/arm64/include/uapi/asm/setup.h @@ -21,6 +21,6 @@ #include <linux/types.h> -#define COMMAND_LINE_SIZE 2048 +#define COMMAND_LINE_SIZE 4096 #endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 18938199a838..cd3597a75b3b 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -10,6 +10,8 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_insn.o = -pg CFLAGS_REMOVE_return_address.o = -pg +CFLAGS_setup.o = -DUTS_MACHINE='"$(UTS_MACHINE)"' + # Object file lists. 
arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-fpsimd.o process.o ptrace.o setup.o signal.o \ diff --git a/arch/arm64/kernel/app_setting.c b/arch/arm64/kernel/app_setting.c index 0c6b00317645..9ee7a753c080 100644 --- a/arch/arm64/kernel/app_setting.c +++ b/arch/arm64/kernel/app_setting.c @@ -28,7 +28,7 @@ static struct kparam_string kps = { .string = lib_str, .maxlen = MAX_LEN, }; -static int set_name(const char *str, struct kernel_param *kp); +static int set_name(const char *str, const struct kernel_param *kp); module_param_call(lib_name, set_name, param_get_string, &kps, S_IWUSR); bool use_app_setting = true; @@ -43,7 +43,7 @@ bool use_32bit_app_setting_pro; module_param(use_32bit_app_setting_pro, bool, 0644); MODULE_PARM_DESC(use_32bit_app_setting_pro, "control use of 32 bit app specific settings"); -static int set_name(const char *str, struct kernel_param *kp) +static int set_name(const char *str, const struct kernel_param *kp) { int len = strlen(str); char *name; diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e012ecd018ee..89160fcccf8b 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -16,7 +16,6 @@ #include <asm/cpufeature.h> #include <asm/insn.h> -#include <asm/opcodes.h> #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> @@ -63,6 +62,7 @@ struct insn_emulation { static LIST_HEAD(insn_emulation); static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); +static DEFINE_MUTEX(insn_emulation_mutex); static void register_emulation_hooks(struct insn_emulation_ops *ops) { @@ -208,10 +208,10 @@ static int emulation_proc_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = 0; - struct insn_emulation *insn = (struct insn_emulation *) table->data; + struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode); enum insn_emulation_mode prev_mode = insn->current_mode; - table->data = &insn->current_mode; + mutex_lock(&insn_emulation_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write || prev_mode == insn->current_mode) @@ -224,7 +224,7 @@ static int emulation_proc_handler(struct ctl_table *table, int write, update_insn_emulation_mode(insn, INSN_UNDEF); } ret: - table->data = insn; + mutex_unlock(&insn_emulation_mutex); return ret; } @@ -254,7 +254,7 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) sysctl->maxlen = sizeof(int); sysctl->procname = insn->ops->name; - sysctl->data = insn; + sysctl->data = &insn->current_mode; sysctl->extra1 = &insn->min; sysctl->extra2 = &insn->max; sysctl->proc_handler = emulation_proc_handler; @@ -366,6 +366,10 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 + #define ARM_OPCODE_CONDITION_UNCOND 0xf static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f0ca0eb3b077..242f008666e5 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -144,7 +144,6 @@ alternative_else_nop_endif #else str x20, [tsk, #TI_ADDR_LIMIT] #endif - ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 
86d444f9c2c1..0c46b0b5ba29 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -73,7 +73,11 @@ #ifdef CONFIG_EFI -__efistub_stext_offset = stext - _text; +/* + * Use ABSOLUTE() to avoid ld.lld treating this as a relative symbol: + * https://github.com/ClangBuiltLinux/linux/issues/561 + */ +__efistub_stext_offset = ABSOLUTE(stext - _text); /* * Prevent the symbol aliases below from being emitted into the kallsyms diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index a3f8f8bbfc92..59a4139b3294 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -30,7 +30,6 @@ #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/fixmap.h> -#include <asm/opcodes.h> #include <asm/insn.h> #define AARCH64_INSN_SF_BIT BIT(31) diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds index 8949f6c6f729..05881e2b414c 100644 --- a/arch/arm64/kernel/module.lds +++ b/arch/arm64/kernel/module.lds @@ -1,3 +1,3 @@ SECTIONS { - .plt (NOLOAD) : { BYTE(0) } + .plt : { BYTE(0) } } diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index f7931d900bca..6ece25660da0 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -77,8 +77,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD If instruction is supported and uses instruction slot, * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ -static enum kprobe_insn __kprobes -arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -88,26 +88,26 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; if (aarch64_insn_is_bcond(insn)) { - asi->handler = simulate_b_cond; + api->handler = simulate_b_cond; } else if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn)) { - asi->handler = simulate_cbz_cbnz; + api->handler = simulate_cbz_cbnz; } else if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - asi->handler = simulate_tbz_tbnz; + api->handler = simulate_tbz_tbnz; } else if (aarch64_insn_is_adr_adrp(insn)) { - asi->handler = simulate_adr_adrp; + api->handler = simulate_adr_adrp; } else if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - asi->handler = simulate_b_bl; + api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { - asi->handler = simulate_br_blr_ret; + api->handler = simulate_br_blr_ret; } else if (aarch64_insn_is_ldr_lit(insn)) { - asi->handler = simulate_ldr_literal; + api->handler = simulate_ldr_literal; } else if (aarch64_insn_is_ldrsw_lit(insn)) { - asi->handler = simulate_ldrsw_literal; + api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -119,6 +119,7 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD_NO_SLOT; } +#ifdef CONFIG_KPROBES static bool __kprobes is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) { @@ -137,13 +138,13 @@ is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) return false; } -enum kprobe_insn __kprobes +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { - enum kprobe_insn decoded; 
- kprobe_opcode_t insn = le32_to_cpu(*addr); - kprobe_opcode_t *scan_start = addr - 1; - kprobe_opcode_t *scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; + enum probe_insn decoded; + probe_opcode_t insn = le32_to_cpu(*addr); + probe_opcode_t *scan_start = addr - 1; + probe_opcode_t *scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) struct module *mod; #endif @@ -164,7 +165,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) preempt_enable(); } #endif - decoded = arm_probe_decode_insn(insn, asi); + decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded == INSN_REJECTED || is_probed_address_atomic(scan_start, scan_end)) @@ -172,3 +173,4 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) return decoded; } +#endif diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index d438289646a6..76d3f315407f 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -23,13 +23,17 @@ */ #define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t)) -enum kprobe_insn { +enum probe_insn { INSN_REJECTED, INSN_GOOD_NO_SLOT, INSN_GOOD, }; -enum kprobe_insn __kprobes +#ifdef CONFIG_KPROBES +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); +#endif +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4ea8433011d0..40278b937088 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -44,31 +44,31 @@ post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ - p->ainsn.insn[0] = cpu_to_le32(p->opcode); + p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); - flush_icache_range((uintptr_t) (p->ainsn.insn), - (uintptr_t) (p->ainsn.insn) + + flush_icache_range((uintptr_t) (p->ainsn.api.insn), + (uintptr_t) (p->ainsn.api.insn) + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.restore = (unsigned long) p->addr + + p->ainsn.api.restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. 
No need to adjust the PC */ - p->ainsn.restore = 0; + p->ainsn.api.restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (p->ainsn.handler) - p->ainsn.handler((u32)p->opcode, (long)p->addr, regs); + if (p->ainsn.api.handler) + p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(kcb, regs); @@ -98,18 +98,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.insn = NULL; + p->ainsn.api.insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) + p->ainsn.api.insn = get_insn_slot(); + if (!p->ainsn.api.insn) return -ENOMEM; break; }; /* prepare the instruction */ - if (p->ainsn.insn) + if (p->ainsn.api.insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -142,9 +142,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; + if (p->ainsn.api.insn) { + free_insn_slot(p->ainsn.api.insn, 0); + p->ainsn.api.insn = NULL; } } @@ -239,9 +239,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.insn) { + if (p->ainsn.api.insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.insn; + slot = (unsigned long)p->ainsn.api.insn; set_ss_context(kcb, slot); /* mark pending ss */ @@ -293,8 +293,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) return; /* return addr restore if non-branching insn */ - if (cur->ainsn.restore != 0) - instruction_pointer_set(regs, cur->ainsn.restore); + if (cur->ainsn.api.restore != 0) + instruction_pointer_set(regs, cur->ainsn.api.restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 01f259ec5700..1d76ffb34d40 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -307,7 +307,7 @@ void __init setup_arch(char **cmdline_p) { pr_info("Boot CPU: AArch64 Processor [%08x]\n", read_cpuid_id()); - sprintf(init_utsname()->machine, ELF_PLATFORM); + sprintf(init_utsname()->machine, UTS_MACHINE); init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c39872a7b03c..19f7e1d6fc24 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -220,7 +220,7 @@ realtime: get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 + clock_gettime_return shift=1 ALIGN monotonic: @@ -243,7 +243,7 @@ monotonic: clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 - clock_gettime_return, shift=1 + clock_gettime_return shift=1 ALIGN monotonic_raw: @@ -264,7 +264,7 @@ monotonic_raw: clock_nsec=x15, nsec_to_sec=x9 add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 + clock_gettime_return shift=1 ALIGN realtime_coarse: diff --git a/arch/arm64/kernel/vmlinux.lds.S 
b/arch/arm64/kernel/vmlinux.lds.S index 71c8076bbc60..d0cf2910d808 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -61,7 +61,7 @@ jiffies = jiffies_64; #define TRAMP_TEXT \ . = ALIGN(PAGE_SIZE); \ VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ - *(.entry.tramp.text) \ + KEEP(*(.entry.tramp.text)) \ . = ALIGN(PAGE_SIZE); \ VMLINUX_SYMBOL(__entry_tramp_text_end) = .; #else @@ -115,7 +115,8 @@ SECTIONS *(.discard) *(.discard.*) *(.interp .dynamic) - *(.dynsym .dynstr .hash) + *(.dynsym .dynstr .hash .gnu.hash) + *(.eh_frame) } . = KIMAGE_VADDR + TEXT_OFFSET; @@ -178,18 +179,18 @@ SECTIONS . = ALIGN(4); .altinstructions : { __alt_instructions = .; - *(.altinstructions) + KEEP(*(.altinstructions)) __alt_instructions_end = .; } .altinstr_replacement : { - *(.altinstr_replacement) + KEEP(*(.altinstr_replacement)) } - .rela : ALIGN(8) { + .rela.dyn : ALIGN(8) { *(.rela .rela*) } - __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela); + __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR); + __rela_size = SIZEOF(.rela.dyn); . = ALIGN(SEGMENT_ALIGN); __init_end = .; diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a5272c07d1cb..9408780f4479 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM depends on !ARM64_16K_PAGES select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 67613937711f..dfedd4ab1a76 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -68,9 +68,8 @@ stp \ptr, \regB, [\regC], \val .endm - .weak memcpy ENTRY(__memcpy) -ENTRY(memcpy) +WEAK(memcpy) #include "copy_template.S" ret ENDPIPROC(memcpy) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S index a5a4459013b1..e3de8f05c21a 100644 --- a/arch/arm64/lib/memmove.S +++ b/arch/arm64/lib/memmove.S @@ -57,9 +57,8 @@ C_h .req x12 D_l .req x13 D_h .req x14 - .weak memmove ENTRY(__memmove) -ENTRY(memmove) +WEAK(memmove) cmp dstin, src b.lo __memcpy add tmp1, src, count diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index f2670a9f218c..316263c47c00 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -54,9 +54,8 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 - .weak memset ENTRY(__memset) -ENTRY(memset) +WEAK(memset) mov dst, dstin /* Preserve return value. 
*/ and A_lw, val, #255 orr A_lw, A_lw, A_lw, lsl #8 diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2ae12825529f..078ae1a95965 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -18,7 +18,6 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_MMIO select SRCU ---help--- diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c2024ac9d4e8..93a4b562aa5b 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -19,7 +19,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD select SRCU diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index b126ce49ae7b..03b5f6bf2584 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -713,7 +713,7 @@ static void cmm_exit(void) * Return value: * 0 on success / other on failure **/ -static int cmm_set_disable(const char *val, struct kernel_param *kp) +static int cmm_set_disable(const char *val, const struct kernel_param *kp) { int disable = simple_strtoul(val, NULL, 10); diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 5fce52cf0e57..6c39e24abe84 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_EVENTFD select KVM_ASYNC_PF diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index 9cfa2ffaa9d6..9e6ae335f7da 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -51,7 +51,7 @@ enum __force_cpu_type { }; static int force_cpu_type; -static int set_cpu_type(const char *str, struct kernel_param *kp) +static int set_cpu_type(const char *str, const struct kernel_param *kp) { if (!strcmp(str, "timer")) { force_cpu_type = timer; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51601246461a..38a0643783bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,7 +19,6 @@ config X86 def_bool y select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e62f4401e792..a63934de4e58 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -383,3 +383,5 @@ 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 +424 i386 pidfd_send_signal sys_pidfd_send_signal +434 i386 pidfd_open sys_pidfd_open diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index aa51cdd2a11a..e0e52d5fcda8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -332,6 +332,8 @@ 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 +424 common pidfd_send_signal sys_pidfd_send_signal +434 common pidfd_open sys_pidfd_open # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 53b7f53f6207..c88de752cabc 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -26,7 +26,6 @@ config KVM depends on X86_LOCAL_APIC 
select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f24bd7249536..a8bb10a78bc7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -614,7 +614,7 @@ enum __force_cpu_type { static int force_cpu_type; -static int set_cpu_type(const char *str, struct kernel_param *kp) +static int set_cpu_type(const char *str, const struct kernel_param *kp) { if (!strcmp(str, "timer")) { force_cpu_type = timer; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 3096c087b732..d5d26464cac8 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1608,7 +1608,8 @@ error: return -ENODEV; } -static int param_set_event_clearing(const char *val, struct kernel_param *kp) +static int param_set_event_clearing(const char *val, + const struct kernel_param *kp) { int result = 0; @@ -1626,7 +1627,8 @@ static int param_set_event_clearing(const char *val, struct kernel_param *kp) return result; } -static int param_get_event_clearing(char *buffer, struct kernel_param *kp) +static int param_get_event_clearing(char *buffer, + const struct kernel_param *kp) { switch (ec_event_clearing) { case ACPI_EC_EVT_TIMING_STATUS: diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 13c308323e94..764047d65d04 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -227,7 +227,8 @@ module_param_cb(trace_method_name, ¶m_ops_trace_method, &trace_method_name, module_param_cb(trace_debug_layer, ¶m_ops_trace_attrib, &acpi_gbl_trace_dbg_layer, 0644); module_param_cb(trace_debug_level, ¶m_ops_trace_attrib, &acpi_gbl_trace_dbg_level, 0644); -static int param_set_trace_state(const char *val, struct kernel_param *kp) +static int param_set_trace_state(const char *val, + const struct kernel_param *kp) { acpi_status status; const char *method = trace_method_name; @@ -263,7 +264,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp) return 0; } -static int param_get_trace_state(char *buffer, struct kernel_param *kp) +static int param_get_trace_state(char *buffer, const struct kernel_param *kp) { if (!(acpi_gbl_trace_flags & ACPI_TRACE_ENABLED)) return sprintf(buffer, "disable"); @@ -292,7 +293,8 @@ MODULE_PARM_DESC(aml_debug_output, "To enable/disable the ACPI Debug Object output."); /* /sys/module/acpi/parameters/acpica_version */ -static int param_get_acpica_version(char *buffer, struct kernel_param *kp) +static int param_get_acpica_version(char *buffer, + const struct kernel_param *kp) { int result; diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 08e75d72ddff..7c584e2ea476 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -147,7 +147,7 @@ static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; static int binder_set_stop_on_user_error(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { int ret; @@ -1968,6 +1968,18 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; + if (ret && ref == new_ref) { + /* + * Cleanup the failed reference here as the target + * could now be dead and have already released its + * references by now. Calling on the new reference + * with strong=0 and a tmp_refs will not decrement + * the node. The new_ref gets kfree'd below. 
+ */ + binder_cleanup_ref_olocked(new_ref); + ref = NULL; + } + binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 602cbb04bee8..a4e8a2d26f6e 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -239,7 +239,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES help This option enables the framework for buffer-sharing between multiple drivers. A buffer is associated with a file using driver diff --git a/drivers/char/adsprpc.c b/drivers/char/adsprpc.c index b5b239eb6dc3..69bfaa0bc6f4 100644 --- a/drivers/char/adsprpc.c +++ b/drivers/char/adsprpc.c @@ -2747,6 +2747,7 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, char *fileinfo = NULL; char single_line[UL_SIZE] = "----------------"; char title[UL_SIZE] = "========================="; + unsigned long irq_flags = 0; fileinfo = kzalloc(DEBUGFS_SIZE, GFP_KERNEL); if (!fileinfo) @@ -2809,6 +2810,7 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%s%s%s%s%s\n", single_line, single_line, single_line, single_line, single_line); + spin_lock_irqsave(&me->hlock, irq_flags); hlist_for_each_entry_safe(gmaps, n, &me->maps, hn) { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20d|0x%-18llX|0x%-18X|0x%-20lX\n\n", @@ -2816,18 +2818,21 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, (uint32_t)gmaps->size, gmaps->va); } + spin_unlock_irqrestore(&me->hlock, irq_flags); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20s|%-20s|%-20s|%-20s\n", "len", "refs", "raddr", "flags"); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%s%s%s%s%s\n", single_line, single_line, single_line, single_line, single_line); + spin_lock_irqsave(&me->hlock, irq_flags); hlist_for_each_entry_safe(gmaps, n, &me->maps, hn) { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "0x%-18X|%-20d|%-20lu|%-20u\n", (uint32_t)gmaps->len, gmaps->refs, gmaps->raddr, gmaps->flags); } + spin_unlock_irqrestore(&me->hlock, irq_flags); } else { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "\n%s %13s %d\n", "cid", ":", fl->cid); @@ -2869,6 +2874,7 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, "%s%s%s%s%s\n", single_line, single_line, single_line, single_line, single_line); + mutex_lock(&fl->map_mutex); hlist_for_each_entry_safe(map, n, &fl->maps, hn) { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "0x%-20lX|0x%-20llX|0x%-20zu\n\n", @@ -2878,6 +2884,7 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, "%s %d\n\n", "DEV_MINOR:", fl->dev_minor); } + mutex_unlock(&fl->map_mutex); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20s|%-20s|%-20s|%-20s\n", "len", "refs", @@ -2886,23 +2893,27 @@ static ssize_t fastrpc_debugfs_read(struct file *filp, char __user *buffer, "%s%s%s%s%s\n", single_line, single_line, single_line, single_line, single_line); + mutex_lock(&fl->map_mutex); hlist_for_each_entry_safe(map, n, &fl->maps, hn) { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20zu|%-20d|0x%-20lX|%-20d\n\n", map->len, map->refs, map->raddr, map->uncached); } + mutex_unlock(&fl->map_mutex); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20s|%-20s\n", "secure", "attr"); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%s%s%s%s%s\n", single_line, single_line, single_line, single_line, single_line); + 
mutex_lock(&fl->map_mutex); hlist_for_each_entry_safe(map, n, &fl->maps, hn) { len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "%-20d|0x%-20lX\n\n", map->secure, map->attr); } + mutex_unlock(&fl->map_mutex); len += scnprintf(fileinfo + len, DEBUGFS_SIZE - len, "\n%s %s %s\n", title, " LIST OF PENDING SMQCONTEXTS ", title); diff --git a/drivers/char/diag/diag_dci.c b/drivers/char/diag/diag_dci.c index 973bc3b1c5b5..ed52ebbb786a 100644 --- a/drivers/char/diag/diag_dci.c +++ b/drivers/char/diag/diag_dci.c @@ -1,4 +1,5 @@ /* Copyright (c) 2012-2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -1615,7 +1616,6 @@ static int diag_send_dci_pkt(struct diag_cmd_reg_t *entry, return -EIO; } - mutex_lock(&driver->dci_mutex); /* prepare DCI packet */ header.start = CONTROL_CHAR; header.version = 1; @@ -1634,7 +1634,6 @@ static int diag_send_dci_pkt(struct diag_cmd_reg_t *entry, diag_update_pkt_buffer(driver->apps_dci_buf, write_len, DCI_PKT_TYPE); diag_update_sleeping_process(entry->pid, DCI_PKT_TYPE); - mutex_unlock(&driver->dci_mutex); return DIAG_DCI_NO_ERROR; } @@ -1654,7 +1653,6 @@ static int diag_send_dci_pkt(struct diag_cmd_reg_t *entry, entry->proc); status = DIAG_DCI_SEND_DATA_FAIL; } - mutex_unlock(&driver->dci_mutex); return status; } @@ -1734,7 +1732,16 @@ static int diag_send_dci_pkt_remote(unsigned char *data, int len, int tag, write_len += dci_header_size; *(int *)(buf + write_len) = tag; write_len += sizeof(int); - memcpy(buf + write_len, data, len); + if ((write_len + len) < DIAG_MDM_BUF_SIZE) { + memcpy(buf + write_len, data, len); + } else { + pr_err("diag: skip writing invalid length packet, token: %d, pkt_len: %d\n", + token, (write_len + len)); + spin_lock_irqsave(&driver->dci_mempool_lock, flags); + diagmem_free(driver, buf, dci_ops_tbl[token].mempool); + spin_unlock_irqrestore(&driver->dci_mempool_lock, flags); + return -EAGAIN; + } write_len += len; *(buf + write_len) = CONTROL_CHAR; /* End Terminator */ write_len += sizeof(uint8_t); @@ -2115,8 +2122,11 @@ static int diag_process_dci_pkt_rsp(unsigned char *buf, int len) if (temp_entry) { reg_item = container_of(temp_entry, struct diag_cmd_reg_t, entry); - ret = diag_send_dci_pkt(reg_item, req_buf, req_len, + mutex_lock(&driver->dci_mutex); + if (req_entry) + ret = diag_send_dci_pkt(reg_item, req_buf, req_len, req_entry->tag); + mutex_unlock(&driver->dci_mutex); } else { DIAG_LOG(DIAG_DEBUG_DCI, "Command not found: %02x %02x %02x\n", reg_entry.cmd_code, reg_entry.subsys_id, diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index 9f2e3be2c5b8..676c910e990f 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -66,7 +66,7 @@ static void (*specific_poweroff_func)(ipmi_user_t user); /* Holds the old poweroff function so we can restore it on removal. 
*/ static void (*old_poweroff_func)(void); -static int set_param_ifnum(const char *val, struct kernel_param *kp) +static int set_param_ifnum(const char *val, const struct kernel_param *kp) { int rv = param_set_int(val, kp); if (rv) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 2f8ff63bbbe4..1ad69e8d6d6d 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1357,7 +1357,7 @@ static unsigned int num_slave_addrs; #define IPMI_MEM_ADDR_SPACE 1 static char *addr_space_to_str[] = { "i/o", "mem" }; -static int hotmod_handler(const char *val, struct kernel_param *kp); +static int hotmod_handler(const char *val, const struct kernel_param *kp); module_param_call(hotmod, hotmod_handler, NULL, NULL, 0200); MODULE_PARM_DESC(hotmod, "Add and remove interfaces. See" @@ -1814,7 +1814,7 @@ static struct smi_info *smi_info_alloc(void) return info; } -static int hotmod_handler(const char *val, struct kernel_param *kp) +static int hotmod_handler(const char *val, const struct kernel_param *kp) { char *str = kstrdup(val, GFP_KERNEL); int rv; diff --git a/drivers/crypto/msm/qce50.c b/drivers/crypto/msm/qce50.c index 7740a8c59126..3f8c512e0cd9 100644 --- a/drivers/crypto/msm/qce50.c +++ b/drivers/crypto/msm/qce50.c @@ -1,6 +1,6 @@ /* Qualcomm Crypto Engine driver. * - * Copyright (c) 2012-2018, 2020 The Linux Foundation. All rights reserved. + * Copyright (c) 2012-2018, 2020-2021 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -866,6 +866,11 @@ static int _ce_setup_cipher(struct qce_device *pce_dev, struct qce_req *creq, break; case CIPHER_ALG_3DES: if (creq->mode != QCE_MODE_ECB) { + if (ivsize > MAX_IV_LENGTH) { + pr_err("%s: error: Invalid length parameter\n", + __func__); + return -EINVAL; + } _byte_stream_to_net_words(enciv32, creq->iv, ivsize); pce = cmdlistinfo->encr_cntr_iv; pce->data = enciv32[0]; @@ -914,6 +919,11 @@ static int _ce_setup_cipher(struct qce_device *pce_dev, struct qce_req *creq, } } if (creq->mode != QCE_MODE_ECB) { + if (ivsize > MAX_IV_LENGTH) { + pr_err("%s: error: Invalid length parameter\n", + __func__); + return -EINVAL; + } if (creq->mode == QCE_MODE_XTS) _byte_stream_swap_to_net_words(enciv32, creq->iv, ivsize); diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index d459cf4b8579..4aaf54132980 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -50,7 +50,7 @@ unsigned int edac_mc_get_poll_msec(void) return edac_mc_poll_msec; } -static int edac_set_poll_msec(const char *val, struct kernel_param *kp) +static int edac_set_poll_msec(const char *val, const struct kernel_param *kp) { unsigned int i; int ret; diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 9cb082a19d8a..d93afc585be4 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -19,7 +19,8 @@ #ifdef CONFIG_EDAC_DEBUG -static int edac_set_debug_level(const char *buf, struct kernel_param *kp) +static int edac_set_debug_level(const char *buf, + const struct kernel_param *kp) { unsigned long val; int ret; diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 832df3c58e2f..b858494a0a55 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -18,6 +18,11 @@ #include "efistub.h" +bool efi__get___nokaslr(void) +{ + return nokaslr(); +} 
+ static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) { static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index e33593ed8f84..1f84f9ac5430 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -38,7 +38,7 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, u64 phys_seed = 0; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - if (!nokaslr()) { + if (!efi__get___nokaslr()) { status = efi_get_random_bytes(sys_table_arg, sizeof(phys_seed), (u8 *)&phys_seed); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index a5eaa3ac0a5d..f1907c792a52 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -9,6 +9,8 @@ extern int __pure nokaslr(void); void efi_char16_printk(efi_system_table_t *, efi_char16_t *); +bool efi__get___nokaslr(void); + efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image, void **__fh); diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c index 7c9a801472e5..cd44789da808 100644 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -1,4 +1,5 @@ -/* Copyright (c) 2002,2007-2019, The Linux Foundation. All rights reserved. +/* Copyright (c) 2002,2007-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -1016,6 +1017,8 @@ static int adreno_probe(struct platform_device *pdev) adreno_debugfs_init(adreno_dev); adreno_profile_init(adreno_dev); + adreno_dev->perfcounter = false; + adreno_sysfs_init(adreno_dev); kgsl_pwrscale_init(&pdev->dev, CONFIG_QCOM_ADRENO_DEFAULT_GOVERNOR); diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h index f433cdef1827..2af892671ee5 100644 --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -1,4 +1,5 @@ -/* Copyright (c) 2008-2018 The Linux Foundation. All rights reserved. +/* Copyright (c) 2008-2018,2020 The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -439,6 +440,12 @@ struct adreno_device { struct list_head active_list; spinlock_t active_list_lock; + + /* + * @perfcounter: Flag to clear perfcounters across contexts and + * controls perfcounter ioctl read + */ + bool perfcounter; }; /** diff --git a/drivers/gpu/msm/adreno_compat.c b/drivers/gpu/msm/adreno_compat.c index 5a8d587d4536..faaec90ba74a 100644 --- a/drivers/gpu/msm/adreno_compat.c +++ b/drivers/gpu/msm/adreno_compat.c @@ -1,4 +1,5 @@ /* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -229,6 +230,14 @@ static long adreno_ioctl_perfcounter_read_compat( struct kgsl_perfcounter_read_compat *read32 = data; struct kgsl_perfcounter_read read; + /* + * When performance counter zapping is enabled, the counters are cleared + * across context switches. 
Reading the counters when they are zapped is + * not permitted. + */ + if (!adreno_dev->perfcounter) + return -EPERM; + read.reads = (struct kgsl_perfcounter_read_group __user *) (uintptr_t)read32->reads; read.count = read32->count; diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c index 5209f1b2a2e6..3c804ca09371 100644 --- a/drivers/gpu/msm/adreno_dispatch.c +++ b/drivers/gpu/msm/adreno_dispatch.c @@ -1,4 +1,5 @@ /* Copyright (c) 2013-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -1149,7 +1150,8 @@ static inline bool _verify_ib(struct kgsl_device_private *dev_priv, } /* Make sure that the address is mapped */ - if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, ib->gpuaddr)) { + if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, ib->gpuaddr, + ib->size)) { pr_context(device, context, "ctxt %d invalid ib gpuaddr %llX\n", context->id, ib->gpuaddr); return false; diff --git a/drivers/gpu/msm/adreno_ioctl.c b/drivers/gpu/msm/adreno_ioctl.c index 0d5e3e094c36..579ba2b61863 100644 --- a/drivers/gpu/msm/adreno_ioctl.c +++ b/drivers/gpu/msm/adreno_ioctl.c @@ -1,4 +1,5 @@ /* Copyright (c) 2002,2007-2016, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -76,6 +77,14 @@ static long adreno_ioctl_perfcounter_read(struct kgsl_device_private *dev_priv, struct adreno_device *adreno_dev = ADRENO_DEVICE(dev_priv->device); struct kgsl_perfcounter_read *read = data; + /* + * When performance counter zapping is enabled, the counters are cleared + * across context switches. Reading the counters when they are zapped is + * not permitted. + */ + if (!adreno_dev->perfcounter) + return -EPERM; + return (long) adreno_perfcounter_read_group(adreno_dev, read->reads, read->count); } diff --git a/drivers/gpu/msm/adreno_sysfs.c b/drivers/gpu/msm/adreno_sysfs.c index f893b29e816e..a6c179f7c932 100644 --- a/drivers/gpu/msm/adreno_sysfs.c +++ b/drivers/gpu/msm/adreno_sysfs.c @@ -1,4 +1,5 @@ /* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -233,6 +234,31 @@ static unsigned int _lm_show(struct adreno_device *adreno_dev) return test_bit(ADRENO_LM_CTRL, &adreno_dev->pwrctrl_flag); } +static unsigned int _perfcounter_show(struct adreno_device *adreno_dev) +{ + return adreno_dev->perfcounter; +} + +static int _perfcounter_store(struct adreno_device *adreno_dev, + unsigned int val) +{ + struct kgsl_device *device = KGSL_DEVICE(adreno_dev); + + if (adreno_dev->perfcounter == val) + return 0; + + mutex_lock(&device->mutex); + + /* Power down the GPU before changing the state */ + kgsl_pwrctrl_change_state(device, KGSL_STATE_SUSPEND); + adreno_dev->perfcounter = val; + kgsl_pwrctrl_change_state(device, KGSL_STATE_SLUMBER); + + mutex_unlock(&device->mutex); + + return 0; +} + static ssize_t _sysfs_store_u32(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -327,6 +353,7 @@ static ADRENO_SYSFS_BOOL(lm); static ADRENO_SYSFS_BOOL(preemption); static ADRENO_SYSFS_BOOL(hwcg); static ADRENO_SYSFS_BOOL(throttling); +static ADRENO_SYSFS_BOOL(perfcounter); @@ -343,6 +370,7 @@ static const struct device_attribute *_attr_list[] = { &adreno_attr_preemption.attr, &adreno_attr_hwcg.attr, &adreno_attr_throttling.attr, + &adreno_attr_perfcounter.attr, NULL, }; @@ -499,8 +527,11 @@ int adreno_sysfs_init(struct adreno_device *adreno_dev) ret = kgsl_create_device_sysfs_files(device->dev, _attr_list); /* Add the PPD directory and files */ - if (ret == 0) + if (ret == 0) { + /* Notify userspace */ + kobject_uevent(&device->dev->kobj, KOBJ_ADD); ppd_sysfs_init(adreno_dev); + } return 0; } diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c index bda0afed50b6..4b9f779e525e 100644 --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -1,4 +1,5 @@ /* Copyright (c) 2008-2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -1273,7 +1274,7 @@ kgsl_sharedmem_find(struct kgsl_process_private *private, uint64_t gpuaddr) if (!private) return NULL; - if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, gpuaddr)) + if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, gpuaddr, 0)) return NULL; spin_lock(&private->mem_lock); @@ -2183,6 +2184,15 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, unsigned long useraddr) ret = sg_alloc_table_from_pages(memdesc->sgt, pages, npages, 0, memdesc->size, GFP_KERNEL); + + if (ret) + goto out; + + ret = kgsl_cache_range_op(memdesc, 0, memdesc->size, + KGSL_CACHE_OP_FLUSH); + + if (ret) + sg_free_table(memdesc->sgt); out: if (ret) { for (i = 0; i < npages; i++) @@ -2295,10 +2305,27 @@ static int kgsl_setup_dmabuf_useraddr(struct kgsl_device *device, return -EFAULT; } - /* Look for the fd that matches this the vma file */ + /* Look for the fd that matches this vma file */ fd = iterate_fd(current->files, 0, match_file, vma->vm_file); - if (fd != 0) + if (fd) { dmabuf = dma_buf_get(fd - 1); + if (IS_ERR(dmabuf)) { + up_read(&current->mm->mmap_sem); + return PTR_ERR(dmabuf); + } + /* + * It is possible that the fd obtained from iterate_fd + * was closed before passing the fd to dma_buf_get(). + * Hence dmabuf returned by dma_buf_get() could be + * different from vma->vm_file->private_data. Return + * failure if this happens.
+ */ + if (dmabuf != vma->vm_file->private_data) { + dma_buf_put(dmabuf); + up_read(&current->mm->mmap_sem); + return -EBADF; + } + } } if (IS_ERR_OR_NULL(dmabuf)) { diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index 98537730cbc9..31e8a7ea5f65 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -2525,20 +2526,21 @@ static int kgsl_iommu_svm_range(struct kgsl_pagetable *pagetable, } static bool kgsl_iommu_addr_in_range(struct kgsl_pagetable *pagetable, - uint64_t gpuaddr) + uint64_t gpuaddr, uint64_t size) { struct kgsl_iommu_pt *pt = pagetable->priv; if (gpuaddr == 0) return false; - if (gpuaddr >= pt->va_start && gpuaddr < pt->va_end) + if (gpuaddr >= pt->va_start && (gpuaddr + size) < pt->va_end) return true; - if (gpuaddr >= pt->compat_va_start && gpuaddr < pt->compat_va_end) + if (gpuaddr >= pt->compat_va_start && + (gpuaddr + size) < pt->compat_va_end) return true; - if (gpuaddr >= pt->svm_start && gpuaddr < pt->svm_end) + if (gpuaddr >= pt->svm_start && (gpuaddr + size) < pt->svm_end) return true; return false; diff --git a/drivers/gpu/msm/kgsl_mmu.c b/drivers/gpu/msm/kgsl_mmu.c index d0a45e713d0d..228f3396ae90 100644 --- a/drivers/gpu/msm/kgsl_mmu.c +++ b/drivers/gpu/msm/kgsl_mmu.c @@ -1,4 +1,5 @@ /* Copyright (c) 2002,2007-2017,2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -606,10 +607,11 @@ enum kgsl_mmutype kgsl_mmu_get_mmutype(struct kgsl_device *device) EXPORT_SYMBOL(kgsl_mmu_get_mmutype); bool kgsl_mmu_gpuaddr_in_range(struct kgsl_pagetable *pagetable, - uint64_t gpuaddr) + uint64_t gpuaddr, uint64_t size) { if (PT_OP_VALID(pagetable, addr_in_range)) - return pagetable->pt_ops->addr_in_range(pagetable, gpuaddr); + return pagetable->pt_ops->addr_in_range(pagetable, + gpuaddr, size); return false; } @@ -645,7 +647,7 @@ EXPORT_SYMBOL(kgsl_mmu_get_qtimer_global_entry); */ static bool nommu_gpuaddr_in_range(struct kgsl_pagetable *pagetable, - uint64_t gpuaddr) + uint64_t gpuaddr, uint64_t size) { return (gpuaddr != 0) ? true : false; } diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h index 505fe591a53e..fcf3b188f9f7 100644 --- a/drivers/gpu/msm/kgsl_mmu.h +++ b/drivers/gpu/msm/kgsl_mmu.h @@ -1,4 +1,5 @@ /* Copyright (c) 2002,2007-2017, The Linux Foundation. All rights reserved. + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -99,7 +100,8 @@ struct kgsl_mmu_pt_ops { int (*set_svm_region)(struct kgsl_pagetable *, uint64_t, uint64_t); int (*svm_range)(struct kgsl_pagetable *, uint64_t *, uint64_t *, uint64_t); - bool (*addr_in_range)(struct kgsl_pagetable *pagetable, uint64_t); + bool (*addr_in_range)(struct kgsl_pagetable *pagetable, + uint64_t, uint64_t); int (*mmu_map_offset)(struct kgsl_pagetable *pt, uint64_t virtaddr, uint64_t virtoffset, struct kgsl_memdesc *memdesc, uint64_t physoffset, @@ -187,7 +189,8 @@ unsigned int kgsl_virtaddr_to_physaddr(void *virtaddr); unsigned int kgsl_mmu_log_fault_addr(struct kgsl_mmu *mmu, u64 ttbr0, uint64_t addr); enum kgsl_mmutype kgsl_mmu_get_mmutype(struct kgsl_device *device); -bool kgsl_mmu_gpuaddr_in_range(struct kgsl_pagetable *pt, uint64_t gpuaddr); +bool kgsl_mmu_gpuaddr_in_range(struct kgsl_pagetable *pt, uint64_t gpuaddr, + uint64_t size); int kgsl_mmu_get_region(struct kgsl_pagetable *pagetable, uint64_t gpuaddr, uint64_t size); diff --git a/drivers/gpu/msm/kgsl_pool.c b/drivers/gpu/msm/kgsl_pool.c index cd5f8e3efe98..77bb50ded36f 100644 --- a/drivers/gpu/msm/kgsl_pool.c +++ b/drivers/gpu/msm/kgsl_pool.c @@ -1,4 +1,5 @@ /* Copyright (c) 2016-2017, The Linux Foundation. All rights reserved. + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -83,6 +84,15 @@ _kgsl_pool_zero_page(struct page *p, unsigned int pool_order) static void _kgsl_pool_add_page(struct kgsl_page_pool *pool, struct page *p) { + /* + * Sanity check to make sure we don't re-pool a page that + * somebody else has a reference to. 
+ */ + if (WARN_ON_ONCE(unlikely(page_count(p) > 1))) { + __free_pages(p, pool->pool_order); + return; + } + _kgsl_pool_zero_page(p, pool->pool_order); spin_lock(&pool->list_lock); diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c index d15e824e39df..de969713ce59 100644 --- a/drivers/hid/hid-magicmouse.c +++ b/drivers/hid/hid-magicmouse.c @@ -34,7 +34,8 @@ module_param(emulate_scroll_wheel, bool, 0644); MODULE_PARM_DESC(emulate_scroll_wheel, "Emulate a scroll wheel"); static unsigned int scroll_speed = 32; -static int param_set_scroll_speed(const char *val, struct kernel_param *kp) { +static int param_set_scroll_speed(const char *val, + const struct kernel_param *kp) { unsigned long speed; if (!val || kstrtoul(val, 0, &speed) || speed > 63) return -EINVAL; diff --git a/drivers/hwtracing/coresight/coresight-event.c b/drivers/hwtracing/coresight/coresight-event.c index 0bced010d4c5..8e105af3e4e3 100644 --- a/drivers/hwtracing/coresight/coresight-event.c +++ b/drivers/hwtracing/coresight/coresight-event.c @@ -20,12 +20,13 @@ #include <trace/events/exception.h> static int event_abort_enable; -static int event_abort_set(const char *val, struct kernel_param *kp); +static int event_abort_set(const char *val, const struct kernel_param *kp); module_param_call(event_abort_enable, event_abort_set, param_get_int, &event_abort_enable, 0644); static int event_abort_early_panic = 1; -static int event_abort_on_panic_set(const char *val, struct kernel_param *kp); +static int event_abort_on_panic_set(const char *val, + const struct kernel_param *kp); module_param_call(event_abort_early_panic, event_abort_on_panic_set, param_get_int, &event_abort_early_panic, 0644); @@ -96,7 +97,7 @@ static void event_abort_unregister(void) unregister_trace_unhandled_abort(event_abort_unhandled_abort, NULL); } -static int event_abort_set(const char *val, struct kernel_param *kp) +static int event_abort_set(const char *val, const struct kernel_param *kp) { int ret; @@ -114,7 +115,8 @@ static int event_abort_set(const char *val, struct kernel_param *kp) return ret; } -static int event_abort_on_panic_set(const char *val, struct kernel_param *kp) +static int event_abort_on_panic_set(const char *val, + const struct kernel_param *kp) { int ret; diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index f086ef387475..80f5b8c3b004 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -244,7 +244,7 @@ struct chs_geom { static unsigned int ide_disks; static struct chs_geom ide_disks_chs[MAX_HWIFS * MAX_DRIVES]; -static int ide_set_disk_chs(const char *str, struct kernel_param *kp) +static int ide_set_disk_chs(const char *str, const struct kernel_param *kp) { int a, b, c = 0, h = 0, s = 0, i, j = 1; @@ -328,7 +328,7 @@ static void ide_dev_apply_params(ide_drive_t *drive, u8 unit) static unsigned int ide_ignore_cable; -static int ide_set_ignore_cable(const char *s, struct kernel_param *kp) +static int ide_set_ignore_cable(const char *s, const struct kernel_param *kp) { int i, j = 1; diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index 66792e707d74..46dd8288e294 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index c151bb625179..0b2ba086d198 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ 
-23,7 +23,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES ---help--- Userspace InfiniBand access support. This enables the kernel side of userspace verbs and the userspace diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 77cc77ba998f..49691557f798 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -150,7 +150,7 @@ static struct kparam_string kp_txselect = { .string = txselect_list, .maxlen = MAX_ATTEN_LEN }; -static int setup_txselect(const char *, struct kernel_param *); +static int setup_txselect(const char *, const struct kernel_param *); module_param_call(txselect, setup_txselect, param_get_string, &kp_txselect, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(txselect, @@ -6194,7 +6194,7 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change) } /* handle the txselect parameter changing */ -static int setup_txselect(const char *str, struct kernel_param *kp) +static int setup_txselect(const char *str, const struct kernel_param *kp) { struct qib_devdata *dd; unsigned long val; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 4173fe977721..f8ae5d9a7729 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -80,7 +80,7 @@ module_param(srpt_srq_size, int, 0444); MODULE_PARM_DESC(srpt_srq_size, "Shared receive queue (SRQ) size."); -static int srpt_get_u64_x(char *buffer, struct kernel_param *kp) +static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); } diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c index 292991c90c02..f36f4cb3e7df 100644 --- a/drivers/isdn/hardware/mISDN/avmfritz.c +++ b/drivers/isdn/hardware/mISDN/avmfritz.c @@ -156,7 +156,7 @@ _set_debug(struct fritzcard *card) } static int -set_debug(const char *val, struct kernel_param *kp) +set_debug(const char *val, const struct kernel_param *kp) { int ret; struct fritzcard *card; diff --git a/drivers/isdn/hardware/mISDN/mISDNinfineon.c b/drivers/isdn/hardware/mISDN/mISDNinfineon.c index d0b6377b9834..64060429aa50 100644 --- a/drivers/isdn/hardware/mISDN/mISDNinfineon.c +++ b/drivers/isdn/hardware/mISDN/mISDNinfineon.c @@ -244,7 +244,7 @@ _set_debug(struct inf_hw *card) } static int -set_debug(const char *val, struct kernel_param *kp) +set_debug(const char *val, const struct kernel_param *kp) { int ret; struct inf_hw *card; diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c index a74741d28ca8..ec3749184451 100644 --- a/drivers/isdn/hardware/mISDN/netjet.c +++ b/drivers/isdn/hardware/mISDN/netjet.c @@ -111,7 +111,7 @@ _set_debug(struct tiger_hw *card) } static int -set_debug(const char *val, struct kernel_param *kp) +set_debug(const char *val, const struct kernel_param *kp) { int ret; struct tiger_hw *card; diff --git a/drivers/isdn/hardware/mISDN/speedfax.c b/drivers/isdn/hardware/mISDN/speedfax.c index 9815bb4eec9c..1f1446ed8d5f 100644 --- a/drivers/isdn/hardware/mISDN/speedfax.c +++ b/drivers/isdn/hardware/mISDN/speedfax.c @@ -94,7 +94,7 @@ _set_debug(struct sfax_hw *card) } static int -set_debug(const char *val, struct kernel_param *kp) +set_debug(const char *val, const struct kernel_param *kp) { int ret; struct sfax_hw *card; diff --git a/drivers/isdn/hardware/mISDN/w6692.c 
b/drivers/isdn/hardware/mISDN/w6692.c index 741675525b53..796454722610 100644 --- a/drivers/isdn/hardware/mISDN/w6692.c +++ b/drivers/isdn/hardware/mISDN/w6692.c @@ -101,7 +101,7 @@ _set_debug(struct w6692_hw *card) } static int -set_debug(const char *val, struct kernel_param *kp) +set_debug(const char *val, const struct kernel_param *kp) { int ret; struct w6692_hw *card; diff --git a/drivers/md/md.c b/drivers/md/md.c index b05afab6eaf2..8aebcf2a30d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5100,7 +5100,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) return NULL; } -static int add_named_array(const char *val, struct kernel_param *kp) +static int add_named_array(const char *val, const struct kernel_param *kp) { /* val must be "md_*" where * is not all digits. * We allocate an array with a large free minor number, and @@ -9330,11 +9330,11 @@ static __exit void md_exit(void) subsys_initcall(md_init); module_exit(md_exit) -static int get_ro(char *buffer, struct kernel_param *kp) +static int get_ro(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%d", start_readonly); } -static int set_ro(const char *val, struct kernel_param *kp) +static int set_ro(const char *val, const struct kernel_param *kp) { return kstrtouint(val, 10, (unsigned int *)&start_readonly); } diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c index caf69af25601..167ea3bc7a7e 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c @@ -162,12 +162,12 @@ static int32_t msm_vfe32_init_qos_parms(struct vfe_device *vfe_dev, __func__); kfree(ds_settings); kfree(ds_regs); - } else { + } else { for (i = 0; i < ds_entries; i++) msm_camera_io_w(ds_settings[i], vfebase + ds_regs[i]); - kfree(ds_regs); - kfree(ds_settings); + kfree(ds_regs); + kfree(ds_settings); } } else { kfree(ds_regs); diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c index da42439c60d6..e03635b3a833 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c @@ -2271,8 +2271,8 @@ static int msm_isp_process_done_buf(struct vfe_device *vfe_dev, MSM_ISP_BUFFER_STATE_PUT_BUF; buf->buf_debug.put_state_last ^= 1; rc = vfe_dev->buf_mgr->ops->buf_done(vfe_dev->buf_mgr, - buf->bufq_handle, buf->buf_idx, time_stamp, - frame_id, stream_info->runtime_output_format); + buf->bufq_handle, buf->buf_idx, time_stamp, + frame_id, stream_info->runtime_output_format); if (rc == -EFAULT) { msm_isp_halt_send_error(vfe_dev, ISP_EVENT_BUF_FATAL_ERROR); diff --git a/drivers/media/platform/msm/camera_v2/sensor/eeprom/msm_eeprom.c b/drivers/media/platform/msm/camera_v2/sensor/eeprom/msm_eeprom.c index 44a4ad7822cb..6e2eaddbac79 100644 --- a/drivers/media/platform/msm/camera_v2/sensor/eeprom/msm_eeprom.c +++ b/drivers/media/platform/msm/camera_v2/sensor/eeprom/msm_eeprom.c @@ -963,7 +963,7 @@ static int msm_eeprom_get_dt_data(struct msm_eeprom_ctrl_t *e_ctrl) of_node = e_ctrl->pdev->dev.of_node; if (!of_node) { - pr_err("%s: %d of_node is NULL\n", __func__ , __LINE__); + pr_err("%s: %d of_node is NULL\n", __func__, __LINE__); return -ENOMEM; } rc = msm_camera_get_dt_vreg_data(of_node, &power_info->cam_vreg, diff --git a/drivers/media/platform/msm/camera_v2/sensor/io/msm_camera_dt_util.c 
b/drivers/media/platform/msm/camera_v2/sensor/io/msm_camera_dt_util.c index e0b703321dae..419371a9f318 100644 --- a/drivers/media/platform/msm/camera_v2/sensor/io/msm_camera_dt_util.c +++ b/drivers/media/platform/msm/camera_v2/sensor/io/msm_camera_dt_util.c @@ -144,6 +144,7 @@ int msm_camera_fill_vreg_params(struct camera_vreg_t *cam_vreg, break; } } + break; case CAM_V_CUSTOM2: for (j = 0; j < num_vreg; j++) { diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index def22b7fef9c..2c4933363135 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -2202,7 +2202,7 @@ static int uvc_reset_resume(struct usb_interface *intf) * Module parameters */ -static int uvc_clock_param_get(char *buffer, struct kernel_param *kp) +static int uvc_clock_param_get(char *buffer, const struct kernel_param *kp) { if (uvc_clock_param == CLOCK_MONOTONIC) return sprintf(buffer, "CLOCK_MONOTONIC"); @@ -2210,7 +2210,7 @@ static int uvc_clock_param_get(char *buffer, struct kernel_param *kp) return sprintf(buffer, "CLOCK_REALTIME"); } -static int uvc_clock_param_set(const char *val, struct kernel_param *kp) +static int uvc_clock_param_set(const char *val, const struct kernel_param *kp) { if (strncasecmp(val, "clock_", strlen("clock_")) == 0) val += strlen("clock_"); diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 207370d68c17..06cb6933b0ea 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -99,7 +99,7 @@ module_param(mpt_channel_mapping, int, 0); MODULE_PARM_DESC(mpt_channel_mapping, " Mapping id's to channels (default=0)"); static int mpt_debug_level; -static int mpt_set_debug_level(const char *val, struct kernel_param *kp); +static int mpt_set_debug_level(const char *val, const struct kernel_param *kp); module_param_call(mpt_debug_level, mpt_set_debug_level, param_get_int, &mpt_debug_level, 0600); MODULE_PARM_DESC(mpt_debug_level, @@ -242,7 +242,7 @@ pci_enable_io_access(struct pci_dev *pdev) pci_write_config_word(pdev, PCI_COMMAND, command_reg); } -static int mpt_set_debug_level(const char *val, struct kernel_param *kp) +static int mpt_set_debug_level(const char *val, const struct kernel_param *kp) { int ret = param_set_int(val, kp); MPT_ADAPTER *ioc; diff --git a/drivers/misc/ibmasm/ibmasm.h b/drivers/misc/ibmasm/ibmasm.h index 9b083448814d..5bd127727d8e 100644 --- a/drivers/misc/ibmasm/ibmasm.h +++ b/drivers/misc/ibmasm/ibmasm.h @@ -211,7 +211,7 @@ void ibmasmfs_unregister(void); void ibmasmfs_add_sp(struct service_processor *sp); /* uart */ -#ifdef CONFIG_SERIAL_8250 +#if IS_ENABLED(CONFIG_SERIAL_8250) void ibmasm_register_uart(struct service_processor *sp); void ibmasm_unregister_uart(struct service_processor *sp); #else diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index ab2184003c29..71becb5c4dec 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -1127,7 +1127,8 @@ static void kgdbts_put_char(u8 chr) ts.run_test(0, chr); } -static int param_set_kgdbts_var(const char *kmessage, struct kernel_param *kp) +static int param_set_kgdbts_var(const char *kmessage, + const struct kernel_param *kp) { size_t len = strlen(kmessage); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index e2c0057737e6..b468819bc436 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -431,7 +431,7 @@ static int block2mtd_setup2(const char *val) } -static int block2mtd_setup(const char *val, struct 
kernel_param *kp) +static int block2mtd_setup(const char *val, const struct kernel_param *kp) { #ifdef MODULE return block2mtd_setup2(val); diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c index 9734e6903fe6..d312c1b5a78e 100644 --- a/drivers/mtd/devices/phram.c +++ b/drivers/mtd/devices/phram.c @@ -269,7 +269,7 @@ error: return ret; } -static int phram_param_call(const char *val, struct kernel_param *kp) +static int phram_param_call(const char *val, const struct kernel_param *kp) { #ifdef MODULE return phram_setup(val); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 2af841cb7097..f45394281354 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1456,7 +1456,7 @@ static int __init bytes_str_to_int(const char *str) * This function returns zero in case of success and a negative error code in * case of error. */ -static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp) +static int __init ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) { int i, len; struct mtd_dev_param *p; diff --git a/drivers/net/wireless/wcnss/wcnss_wlan.c b/drivers/net/wireless/wcnss/wcnss_wlan.c index fbf2b49c7832..e15d4b5a58eb 100644 --- a/drivers/net/wireless/wcnss/wcnss_wlan.c +++ b/drivers/net/wireless/wcnss/wcnss_wlan.c @@ -1655,7 +1655,7 @@ EXPORT_SYMBOL(wcnss_get_wlan_mac_address); static int enable_wcnss_suspend_notify; static int enable_wcnss_suspend_notify_set(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { int ret; diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 04bb9b1b14da..12720e807117 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -1,7 +1,7 @@ /* * Device tree based initialization code for reserved memory. * - * Copyright (c) 2013, 2015 The Linux Foundation. All Rights Reserved. + * Copyright (c) 2013, 2015, 2017 The Linux Foundation. All Rights Reserved. * Copyright (c) 2013,2014 Samsung Electronics Co., Ltd. 
* http://www.samsung.com * Author: Marek Szyprowski <m.szyprowski@samsung.com> @@ -22,7 +22,7 @@ #include <linux/of_reserved_mem.h> #include <linux/sort.h> -#define MAX_RESERVED_REGIONS 16 +#define MAX_RESERVED_REGIONS 32 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS]; static int reserved_mem_count; diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index a098f8324afd..4f7f3ddcea06 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -772,7 +772,8 @@ void pci_disable_link_state(struct pci_dev *pdev, int state) } EXPORT_SYMBOL(pci_disable_link_state); -static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp) +static int pcie_aspm_set_policy(const char *val, + const struct kernel_param *kp) { int i; struct pcie_link_state *link; @@ -799,7 +800,7 @@ static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp) return 0; } -static int pcie_aspm_get_policy(char *buffer, struct kernel_param *kp) +static int pcie_aspm_get_policy(char *buffer, const struct kernel_param *kp) { int i, cnt = 0; for (i = 0; i < ARRAY_SIZE(policy_str); i++) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index d6d671a925e1..0111dad43c9a 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -1047,7 +1047,7 @@ int arm_pmu_device_probe(struct platform_device *pdev, const struct pmu_probe_info *probe_table) { const struct of_device_id *of_id; - const int (*init_fn)(struct arm_pmu *); + int (*init_fn)(struct arm_pmu *); struct device_node *node = pdev->dev.of_node; struct arm_pmu *pmu; int ret = -ENODEV; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 466a0d0162c3..c345136022bf 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -9247,7 +9247,7 @@ static struct ibm_init_struct ibms_init[] __initdata = { }, }; -static int __init set_ibm_param(const char *val, struct kernel_param *kp) +static int __init set_ibm_param(const char *val, const struct kernel_param *kp) { unsigned int i; struct ibm_struct *ibm; diff --git a/drivers/power/reset/msm-poweroff.c b/drivers/power/reset/msm-poweroff.c index c14bc93de717..f267ec9d80c0 100644 --- a/drivers/power/reset/msm-poweroff.c +++ b/drivers/power/reset/msm-poweroff.c @@ -94,7 +94,7 @@ static void *kaslr_imem_addr; #endif static bool scm_dload_supported; -static int dload_set(const char *val, struct kernel_param *kp); +static int dload_set(const char *val, const struct kernel_param *kp); /* interface for exporting attributes */ struct reset_attribute { struct attribute attr; @@ -183,7 +183,7 @@ static void enable_emergency_dload_mode(void) pr_err("Failed to set secure EDLOAD mode: %d\n", ret); } -static int dload_set(const char *val, struct kernel_param *kp) +static int dload_set(const char *val, const struct kernel_param *kp) { int ret; int old_val = download_mode; diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c index d7597c08fa11..a70fa0eb3871 100644 --- a/drivers/scsi/fcoe/fcoe_transport.c +++ b/drivers/scsi/fcoe/fcoe_transport.c @@ -32,13 +32,13 @@ MODULE_AUTHOR("Open-FCoE.org"); MODULE_DESCRIPTION("FIP discovery protocol and FCoE transport for FCoE HBAs"); MODULE_LICENSE("GPL v2"); -static int fcoe_transport_create(const char *, struct kernel_param *); -static int fcoe_transport_destroy(const char *, struct kernel_param *); +static int fcoe_transport_create(const char *, const struct kernel_param *); +static int fcoe_transport_destroy(const char *, const 
struct kernel_param *); static int fcoe_transport_show(char *buffer, const struct kernel_param *kp); static struct fcoe_transport *fcoe_transport_lookup(struct net_device *device); static struct fcoe_transport *fcoe_netdev_map_lookup(struct net_device *device); -static int fcoe_transport_enable(const char *, struct kernel_param *); -static int fcoe_transport_disable(const char *, struct kernel_param *); +static int fcoe_transport_enable(const char *, const struct kernel_param *); +static int fcoe_transport_disable(const char *, const struct kernel_param *); static int libfcoe_device_notification(struct notifier_block *notifier, ulong event, void *ptr); @@ -842,7 +842,8 @@ EXPORT_SYMBOL(fcoe_ctlr_destroy_store); * * Returns: 0 for success */ -static int fcoe_transport_create(const char *buffer, struct kernel_param *kp) +static int fcoe_transport_create(const char *buffer, + const struct kernel_param *kp) { int rc = -ENODEV; struct net_device *netdev = NULL; @@ -907,7 +908,8 @@ out_nodev: * * Returns: 0 for success */ -static int fcoe_transport_destroy(const char *buffer, struct kernel_param *kp) +static int fcoe_transport_destroy(const char *buffer, + const struct kernel_param *kp) { int rc = -ENODEV; struct net_device *netdev = NULL; @@ -951,7 +953,8 @@ out_nodev: * * Returns: 0 for success */ -static int fcoe_transport_disable(const char *buffer, struct kernel_param *kp) +static int fcoe_transport_disable(const char *buffer, + const struct kernel_param *kp) { int rc = -ENODEV; struct net_device *netdev = NULL; @@ -985,7 +988,8 @@ out_nodev: * * Returns: 0 for success */ -static int fcoe_transport_enable(const char *buffer, struct kernel_param *kp) +static int fcoe_transport_enable(const char *buffer, + const struct kernel_param *kp) { int rc = -ENODEV; struct net_device *netdev = NULL; diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 8d52afd1f71d..c8430f428e14 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -100,7 +100,7 @@ _base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc, int sleep_flag); * */ static int -_scsih_set_fwfault_debug(const char *val, struct kernel_param *kp) +_scsih_set_fwfault_debug(const char *val, const struct kernel_param *kp) { int ret = param_set_int(val, kp); struct MPT3SAS_ADAPTER *ioc; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 0e39bb1489ac..f02fc3e504c0 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -283,7 +283,7 @@ struct _scsi_io_transfer { * Note: The logging levels are defined in mpt3sas_debug.h. */ static int -_scsih_set_debug_level(const char *val, struct kernel_param *kp) +_scsih_set_debug_level(const char *val, const struct kernel_param *kp) { int ret = param_set_int(val, kp); struct MPT3SAS_ADAPTER *ioc; diff --git a/drivers/soc/qcom/hab/hab.h b/drivers/soc/qcom/hab/hab.h index cb7ed52050b2..bdc8f54b08b4 100644 --- a/drivers/soc/qcom/hab/hab.h +++ b/drivers/soc/qcom/hab/hab.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016-2019, The Linux Foundation. All rights reserved. +/* Copyright (c) 2016-2019, 2021, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -98,6 +98,12 @@ enum hab_payload_type { /* make sure concascaded name is less than this value */ #define MAX_VMID_NAME_SIZE 30 +/* + * The maximum value of payload_count in struct export_desc + * Max u32_t size_bytes from hab_ioctl.h(0xFFFFFFFF) / page size(0x1000) + */ +#define MAX_EXP_PAYLOAD_COUNT 0xFFFFF + #define HABCFG_FILE_SIZE_MAX 256 #define HABCFG_MMID_AREA_MAX (MM_ID_MAX/100) diff --git a/drivers/soc/qcom/hab/hab_msg.c b/drivers/soc/qcom/hab/hab_msg.c index 9f92074665df..4494cfbf0e0b 100644 --- a/drivers/soc/qcom/hab/hab_msg.c +++ b/drivers/soc/qcom/hab/hab_msg.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2016-2019, The Linux Foundation. All rights reserved. +/* Copyright (c) 2016-2019, 2021, The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -11,6 +11,7 @@ * */ #include "hab.h" +#include "hab_grantable.h" static int hab_rx_queue_empty(struct virtual_channel *vchan) { @@ -207,6 +208,8 @@ int hab_msg_recv(struct physical_channel *pchan, struct export_desc *exp_desc = NULL, exp_ack = {0}; struct timeval tv = {0}; unsigned long long rx_mpm_tv = 0; + size_t exp_desc_size_expected = 0; + struct compressed_pfns *pfn_table = NULL; /* get the local virtual channel if it isn't an open message */ if (payload_type != HAB_PAYLOAD_TYPE_INIT && @@ -295,8 +298,11 @@ int hab_msg_recv(struct physical_channel *pchan, break; case HAB_PAYLOAD_TYPE_EXPORT: - if (sizebytes > HAB_HEADER_SIZE_MASK) { - pr_err("%s exp size too large %zd header %zd\n", + exp_desc_size_expected = sizeof(struct export_desc) + + sizeof(struct compressed_pfns); + if (sizebytes > (size_t)(HAB_HEADER_SIZE_MASK) || + sizebytes < exp_desc_size_expected) { + pr_err("%s exp size too large/small %zu header %zu\n", pchan->name, sizebytes, sizeof(*exp_desc)); break; } @@ -328,6 +334,53 @@ int hab_msg_recv(struct physical_channel *pchan, hab_export_enqueue(vchan, exp_desc); /* for local use */ hab_send_export_ack(vchan, pchan, &exp_ack); /* ack exporter */ + /* + * We should do all the checks here. + * But in order to improve performance, we put the + * checks related to exp->payload_count and + * pfn_table->region[i].size into function pages_list_create. + * So any potential usage of such data from the remote side + * after the checks here and before the checks in + * pages_list_create needs to add some more checks if necessary.
+ */ + pfn_table = (struct compressed_pfns *)exp_desc->payload; + if (pfn_table->nregions <= 0 || + (pfn_table->nregions > + SIZE_MAX / sizeof(struct region)) || + (SIZE_MAX - exp_desc_size_expected < + pfn_table->nregions * sizeof(struct region))) { + pr_err("%s nregions is too large or negative, nregions:%d!\n", + pchan->name, pfn_table->nregions); + kfree(exp_desc); + break; + } + + if (pfn_table->nregions > exp_desc->payload_count) { + pr_err("%s nregions %d greater than payload_count %d\n", + pchan->name, pfn_table->nregions, + exp_desc->payload_count); + kfree(exp_desc); + break; + } + + if (exp_desc->payload_count > MAX_EXP_PAYLOAD_COUNT) { + pr_err("payload_count out of range: %d size overflow\n", + exp_desc->payload_count); + kfree(exp_desc); + break; + } + + exp_desc_size_expected += + pfn_table->nregions * sizeof(struct region); + if (sizebytes != exp_desc_size_expected) { + pr_err("%s exp size not equal %zu expect %zu\n", + pchan->name, sizebytes, exp_desc_size_expected); + kfree(exp_desc); + break; + } + + hab_export_enqueue(vchan, exp_desc); + hab_send_export_ack(vchan, pchan, exp_desc); break; case HAB_PAYLOAD_TYPE_EXPORT_ACK: diff --git a/drivers/soc/qcom/hab/khab_test.c b/drivers/soc/qcom/hab/khab_test.c index 6add7af0489f..6a89afb3d726 100644 --- a/drivers/soc/qcom/hab/khab_test.c +++ b/drivers/soc/qcom/hab/khab_test.c @@ -241,13 +241,13 @@ int hab_perf_test(long testId) return ret; } -static int kick_hab_perf_test(const char *val, struct kernel_param *kp); -static int get_hab_perf_result(char *buffer, struct kernel_param *kp); +static int kick_hab_perf_test(const char *val, const struct kernel_param *kp); +static int get_hab_perf_result(char *buffer, const struct kernel_param *kp); module_param_call(perf_test, kick_hab_perf_test, get_hab_perf_result, NULL, S_IRUSR | S_IWUSR); -static int kick_hab_perf_test(const char *val, struct kernel_param *kp) +static int kick_hab_perf_test(const char *val, const struct kernel_param *kp) { long testId; int err = kstrtol(val, 10, &testId); @@ -258,7 +258,7 @@ static int kick_hab_perf_test(const char *val, struct kernel_param *kp) return hab_perf_test(testId); } -static int get_hab_perf_result(char *buffer, struct kernel_param *kp) +static int get_hab_perf_result(char *buffer, const struct kernel_param *kp) { return strlcpy(buffer, g_perf_test_result, strlen(g_perf_test_result)+1); diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 9f46e0f56b1a..1cda5df5843b 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -1665,10 +1665,10 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) data.allocation.flags, true); if (IS_ERR(handle)) return PTR_ERR(handle); - pass_to_user(handle); data.allocation.handle = handle->id; cleanup_handle = handle; + pass_to_user(handle); break; } case ION_IOC_FREE: @@ -1713,11 +1713,12 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (IS_ERR(handle)) { ret = PTR_ERR(handle); } else { + data.handle.handle = handle->id; handle = pass_to_user(handle); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { ret = PTR_ERR(handle); - else - data.handle.handle = handle->id; + data.handle.handle = 0; + } } mutex_unlock(&client->lock); break; diff --git a/drivers/staging/android/ion/msm/msm_ion.c b/drivers/staging/android/ion/msm/msm_ion.c index 1a3aca407c9e..0a61301385f2 100644 --- a/drivers/staging/android/ion/msm/msm_ion.c +++ b/drivers/staging/android/ion/msm/msm_ion.c @@ -628,6 +628,9 @@ 
static int check_vaddr_bounds(unsigned long start, unsigned long end) struct vm_area_struct *vma; int ret = 1; + if (!start) + goto out; + if (end < start) goto out; @@ -781,20 +784,28 @@ long msm_ion_custom_ioctl(struct ion_client *client, down_read(&mm->mmap_sem); - start = (unsigned long)data.flush_data.vaddr + - data.flush_data.offset; - end = start + data.flush_data.length; - - if (start && check_vaddr_bounds(start, end)) { - pr_err("%s: virtual address %pK is out of bounds\n", + if ((unsigned long)data.flush_data.vaddr > + (ULONG_MAX - data.flush_data.offset)) { + pr_err("%s: Integer overflow detected for %pK\n", __func__, data.flush_data.vaddr); ret = -EINVAL; } else { - ret = ion_do_cache_op( - client, handle, data.flush_data.vaddr, - data.flush_data.offset, - data.flush_data.length, cmd); + start = (unsigned long)data.flush_data.vaddr + + data.flush_data.offset; + end = start + data.flush_data.length; + + if (check_vaddr_bounds(start, end)) { + pr_err("%s: virtual address %pK is out of bounds\n", + __func__, data.flush_data.vaddr); + ret = -EINVAL; + } else { + ret = ion_do_cache_op( + client, handle, data.flush_data.vaddr, + data.flush_data.offset, + data.flush_data.length, cmd); + } } + up_read(&mm->mmap_sem); ion_free_nolock(client, handle); diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c index a5eea199f733..f8338f0e9d3f 100644 --- a/drivers/staging/rdma/ipath/ipath_init_chip.c +++ b/drivers/staging/rdma/ipath/ipath_init_chip.c @@ -63,7 +63,7 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); */ static ushort ipath_kpiobufs; -static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); +static int ipath_set_kpiobufs(const char *str, const struct kernel_param *kp); module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort, &ipath_kpiobufs, S_IWUSR | S_IRUGO); @@ -1014,7 +1014,7 @@ done: return ret; } -static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp) +static int ipath_set_kpiobufs(const char *str, const struct kernel_param *kp) { struct ipath_devdata *dd; unsigned long flags; diff --git a/drivers/tty/serial/kgdboc.c b/drivers/tty/serial/kgdboc.c index 0314e78e31ff..ec24e82da169 100644 --- a/drivers/tty/serial/kgdboc.c +++ b/drivers/tty/serial/kgdboc.c @@ -232,7 +232,8 @@ static void kgdboc_put_char(u8 chr) kgdb_tty_line, chr); } -static int param_set_kgdboc_var(const char *kmessage, struct kernel_param *kp) +static int param_set_kgdboc_var(const char *kmessage, + const struct kernel_param *kp) { size_t len = strlen(kmessage); diff --git a/drivers/usb/chipidea/otg_fsm.h b/drivers/usb/chipidea/otg_fsm.h index 2689375ae5da..262d6ef8df7c 100644 --- a/drivers/usb/chipidea/otg_fsm.h +++ b/drivers/usb/chipidea/otg_fsm.h @@ -62,7 +62,7 @@ /* SSEND time before SRP */ #define TB_SSEND_SRP (1500) /* minimum 1.5 sec, section:5.1.2 */ -#ifdef CONFIG_USB_OTG_FSM +#if IS_ENABLED(CONFIG_USB_OTG_FSM) int ci_hdrc_otg_fsm_init(struct ci_hdrc *ci); int ci_otg_fsm_work(struct ci_hdrc *ci); diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 0a6810ea4107..61c77107e97d 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1942,6 +1942,9 @@ unknown: if (w_index != 0x5 || (w_value >> 8)) break; interface = w_value & 0xFF; + if (interface >= MAX_CONFIG_INTERFACES || + !os_desc_cfg->interface[interface]) + break; buf[6] = w_index; if (w_length == 0x0A) { count = count_ext_prop(os_desc_cfg, diff --git a/drivers/usb/gadget/configfs.c 
b/drivers/usb/gadget/configfs.c index 9ba61939eed6..2c352d21c0cc 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -16,7 +16,7 @@ #include <linux/usb/ch9.h> #ifdef CONFIG_USB_CONFIGFS_F_ACC -extern int acc_ctrlrequest(struct usb_composite_dev *cdev, +extern int acc_ctrlrequest_composite(struct usb_composite_dev *cdev, const struct usb_ctrlrequest *ctrl); void acc_disconnect(void); #endif @@ -1620,7 +1620,7 @@ static int android_setup(struct usb_gadget *gadget, #ifdef CONFIG_USB_CONFIGFS_F_ACC if (value < 0) - value = acc_ctrlrequest(cdev, c); + value = acc_ctrlrequest_composite(cdev, c); #endif if (value < 0) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index b5c1ad06f8be..0bfd486a4414 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -1066,6 +1066,26 @@ err: } EXPORT_SYMBOL_GPL(acc_ctrlrequest); +int acc_ctrlrequest_composite(struct usb_composite_dev *cdev, + const struct usb_ctrlrequest *ctrl) +{ + u16 w_length = le16_to_cpu(ctrl->wLength); + + if (w_length > USB_COMP_EP0_BUFSIZ) { + if (ctrl->bRequestType & USB_DIR_IN) { + /* Cast away the const, we are going to overwrite on purpose. */ + __le16 *temp = (__le16 *)&ctrl->wLength; + + *temp = cpu_to_le16(USB_COMP_EP0_BUFSIZ); + w_length = USB_COMP_EP0_BUFSIZ; + } else { + return -EINVAL; + } + } + return acc_ctrlrequest(cdev, ctrl); +} +EXPORT_SYMBOL_GPL(acc_ctrlrequest_composite); + static int __acc_function_bind(struct usb_configuration *c, struct usb_function *f, bool configfs) diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 1d13d79d5070..3046066680d9 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -650,14 +650,18 @@ static int rndis_set_response(struct rndis_params *params, rndis_set_cmplt_type *resp; rndis_resp_t *r; + BufLength = le32_to_cpu(buf->InformationBufferLength); + BufOffset = le32_to_cpu(buf->InformationBufferOffset); + if ((BufLength > RNDIS_MAX_TOTAL_SIZE) || + (BufOffset > RNDIS_MAX_TOTAL_SIZE) || + (BufOffset + 8 >= RNDIS_MAX_TOTAL_SIZE)) + return -EINVAL; + r = rndis_add_response(params, sizeof(rndis_set_cmplt_type)); if (!r) return -ENOMEM; resp = (rndis_set_cmplt_type *)r->buf; - BufLength = le32_to_cpu(buf->InformationBufferLength); - BufOffset = le32_to_cpu(buf->InformationBufferOffset); - #ifdef VERBOSE_DEBUG pr_debug("%s: Length: %d\n", __func__, BufLength); pr_debug("%s: Offset: %d\n", __func__, BufOffset); diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index f2b4fdd1f49d..a069150d97f3 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1851,8 +1851,9 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { + spin_unlock_irq(&dev->lock); kfree(kbuf); - goto fail; + return value; } dev->buf = kbuf; @@ -1900,8 +1901,8 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) value = usb_gadget_probe_driver(&gadgetfs_driver); if (value != 0) { - kfree (dev->buf); - dev->buf = NULL; + spin_lock_irq(&dev->lock); + goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. 
alternatively, if users @@ -1917,6 +1918,9 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return value; fail: + dev->config = NULL; + dev->hs_config = NULL; + dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %Zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index 2938153fe7b1..9b1963a91ad5 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -269,7 +269,7 @@ static struct console usbcons = { void usb_serial_console_disconnect(struct usb_serial *serial) { - if (serial && serial->port && serial->port[0] + if (serial && serial->port[0] && serial->port[0] == usbcons_info.port) { usb_serial_console_exit(); usb_serial_put(serial); diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 850d86ca685b..f08809323762 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -24,7 +24,6 @@ menuconfig VFIO select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU || ARM_SMMU_V3) select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c index b30e7d87804b..04e75d5efb19 100644 --- a/drivers/video/console/dummycon.c +++ b/drivers/video/console/dummycon.c @@ -41,12 +41,69 @@ static void dummycon_init(struct vc_data *vc, int init) vc_resize(vc, DUMMY_COLUMNS, DUMMY_ROWS); } -static int dummycon_dummy(void) +static void dummycon_deinit(struct vc_data *vc) +{ +} + +static void dummycon_clear(struct vc_data *vc, int a, int b, int c, int d) +{ +} + +static void dummycon_putc(struct vc_data *vc, int a, int b, int c) +{ +} + +static void dummycon_putcs(struct vc_data *vc, const unsigned short *s, int a, int b, int c) +{ +} + +static void dummycon_cursor(struct vc_data *vc, int a) +{ +} + +static int dummycon_scroll(struct vc_data *vc, int a, int b, int c, int d) { return 0; } -#define DUMMY (void *)dummycon_dummy +static int dummycon_switch(struct vc_data *vc) +{ + return 0; +} + +static int dummycon_blank(struct vc_data *vc, int a, int b) +{ + return 0; +} + +static int dummycon_font_set(struct vc_data *vc, struct console_font *f, unsigned u) +{ + return 0; +} + +static int dummycon_font_default(struct vc_data *vc, struct console_font *f, char *c) +{ + return 0; +} + +static int dummycon_font_copy(struct vc_data *vc, int a) +{ + return 0; +} + +static void con_bmove(struct vc_data *vc, int a, int b, int c, int d, int e, intf) +{ +} + +static int con_set_palette(struct vc_data *vc, unsigned char *p) +{ + return 0; +} + +static int con_scrolldelta(struct vc_data *vc, int x) +{ + return 0; +} /* * The console `switch' structure for the dummy console @@ -58,19 +115,19 @@ const struct consw dummy_con = { .owner = THIS_MODULE, .con_startup = dummycon_startup, .con_init = dummycon_init, - .con_deinit = DUMMY, - .con_clear = DUMMY, - .con_putc = DUMMY, - .con_putcs = DUMMY, - .con_cursor = DUMMY, - .con_scroll = DUMMY, .con_bmove = DUMMY, - .con_switch = DUMMY, - .con_blank = DUMMY, - .con_font_set = DUMMY, - .con_font_default = DUMMY, - .con_font_copy = DUMMY, .con_set_palette = DUMMY, .con_scrolldelta = DUMMY, + .con_deinit = dummycon_deinit, + .con_clear = dummycon_clear, + .con_putc = dummycon_putc, + .con_putcs = dummycon_putcs, + .con_cursor = dummycon_cursor, + .con_scroll = 
dummycon_scroll, + .con_switch = dummycon_switch, + .con_blank = dummycon_blank, + .con_font_set = dummycon_font_set, + .con_font_default = dummycon_font_default, + .con_font_copy = dummycon_font_copy, }; EXPORT_SYMBOL_GPL(dummy_con); diff --git a/drivers/video/fbdev/msm/mdss_dsi_status.c b/drivers/video/fbdev/msm/mdss_dsi_status.c index 0490bcdfbfad..ef07291d4d9a 100644 --- a/drivers/video/fbdev/msm/mdss_dsi_status.c +++ b/drivers/video/fbdev/msm/mdss_dsi_status.c @@ -217,7 +217,8 @@ static int fb_event_callback(struct notifier_block *self, return 0; } -static int param_dsi_status_disable(const char *val, struct kernel_param *kp) +static int param_dsi_status_disable(const char *val, + const struct kernel_param *kp) { int ret = 0; int int_val; @@ -232,7 +233,7 @@ static int param_dsi_status_disable(const char *val, struct kernel_param *kp) return ret; } -static int param_set_interval(const char *val, struct kernel_param *kp) +static int param_set_interval(const char *val, const struct kernel_param *kp) { int ret = 0; int int_val; diff --git a/fs/Makefile b/fs/Makefile index debf0742b3c9..969caba6b282 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/afs/file.c b/fs/afs/file.c index cf8a07e282a6..5290f6e83605 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -123,11 +123,10 @@ static void afs_file_readpage_read_complete(struct page *page, /* * read page from file, directory or symlink, given a key to use */ -int afs_page_filler(void *data, struct page *page) +static int __afs_page_filler(struct key *key, struct page *page) { struct inode *inode = page->mapping->host; struct afs_vnode *vnode = AFS_FS_I(inode); - struct key *key = data; size_t len; off_t offset; int ret; @@ -209,6 +208,13 @@ error: return ret; } +int afs_page_filler(struct file *data, struct page *page) +{ + struct key *key = (struct key *)data; + + return __afs_page_filler(key, page); +} + /* * read page from file, directory or symlink, given a file to nominate the key * to be used @@ -221,14 +227,14 @@ static int afs_readpage(struct file *file, struct page *page) if (file) { key = file->private_data; ASSERT(key != NULL); - ret = afs_page_filler(key, page); + ret = __afs_page_filler(key, page); } else { struct inode *inode = page->mapping->host; key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); } else { - ret = afs_page_filler(key, page); + ret = __afs_page_filler(key, page); key_put(key); } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1330b2a695ff..64452ba25988 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -499,7 +499,7 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_page_filler(void *, struct page *); +extern int afs_page_filler(struct file *, struct page *); /* * flock.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index ccd0b212e82a..05ba277e6269 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, 
options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 98198c57370b..acd43792ef82 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -439,8 +439,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 1ea643faf04b..d8afb402f966 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -241,7 +241,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -255,7 +256,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -330,7 +331,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3530e1c3ff56..d7111b8ce36a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -186,9 +186,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 60f03b78914e..9323b9a8bc72 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -377,9 +377,8 @@ err: * and will start a new collection. Eventually caller must submit the last * segment if present. 
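
The debugfs and afs/cifs hunks above are two halves of one interface change: automount callbacks now receive the mountpoint dentry, so they can create the child mount with vfs_submount() instead of vfs_kern_mount(). A minimal sketch of a debugfs automount written against the new debugfs_automount_t signature; my_automount, my_fs_type and the "relay" name are assumptions for illustration only.

#include <linux/debugfs.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/mount.h>

extern struct file_system_type my_fs_type;	/* assumed registered elsewhere */

static struct vfsmount *my_automount(struct dentry *mntpt, void *data)
{
	/* "data" is whatever was handed to debugfs_create_automount(). */
	return vfs_submount(mntpt, &my_fs_type, "myfs", data);
}

static int __init my_debugfs_init(void)
{
	struct dentry *d;

	d = debugfs_create_automount("relay", NULL, my_automount, NULL);
	return IS_ERR_OR_NULL(d) ? -ENOMEM : 0;
}
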
*/ -static int readpage_strip(void *data, struct page *page) +static int __readpage_strip(struct page_collect *pcol, struct page *page) { - struct page_collect *pcol = data; struct inode *inode = pcol->inode; struct exofs_i_info *oi = exofs_i(inode); loff_t i_size = i_size_read(inode); @@ -470,6 +469,13 @@ fail: return ret; } +static int readpage_strip(struct file *data, struct page *page) +{ + struct page_collect *pcol = (struct page_collect *)data; + + return __readpage_strip(pcol, page); +} + static int exofs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { @@ -499,7 +505,7 @@ static int _readpage(struct page *page, bool read_4_write) _pcol_init(&pcol, 1, page->mapping->host); pcol.read_4_write = read_4_write; - ret = readpage_strip(&pcol, page); + ret = __readpage_strip(&pcol, page); if (ret) { EXOFS_ERR("_readpage => %d\n", ret); return ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 969beec3ff7e..87ab6150343b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -379,14 +379,15 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
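
The afs and exofs hunks convert read_cache_page() fillers from taking an opaque void pointer to taking a struct file pointer: a thin, correctly typed entry point casts the cookie back to the caller's real context, while internal callers keep using the fully typed worker. A minimal sketch of that wrapper pattern; my_ctx, __my_fill_page and my_fill_page are illustrative names.

#include <linux/fs.h>
#include <linux/pagemap.h>

struct my_ctx {
	struct inode *inode;
	/* ...whatever the filesystem needs to read one page... */
};

/* Real worker, kept fully typed for internal callers. */
static int __my_fill_page(struct my_ctx *ctx, struct page *page)
{
	/* read the page contents using ctx ... */
	SetPageUptodate(page);
	unlock_page(page);
	return 0;
}

/*
 * Entry point matching the new filler prototype; the struct file *
 * argument is only a cookie here and is cast back to the real context.
 */
static int my_fill_page(struct file *data, struct page *page)
{
	struct my_ctx *ctx = (struct my_ctx *)data;

	return __my_fill_page(ctx, page);
}

Internal paths (readpages, directory lookups) call __my_fill_page() directly, so only the filler entry point pays the cast.
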
*/ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -1028,6 +1029,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1070,9 +1072,15 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ @@ -1283,7 +1291,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1801,7 +1809,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1843,6 +1852,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2565,6 +2579,7 @@ extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 685a26e9540f..ea26a0dcb87b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -781,6 +781,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4fbede7d7e2f..b65680c5404b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4300,6 +4300,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if 
(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4314,6 +4322,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || @@ -4389,12 +4398,20 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4706,6 +4723,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4718,6 +4736,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4795,6 +4814,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 313a61626d2d..9c6fe28fe27e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3218,6 +3218,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EXDEV; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3527,6 +3533,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, !ext4_has_encryption_key(new_dir))) return -ENOKEY; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3746,6 +3757,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EXDEV; + if 
((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6d3b72c959c8..9f2ca5e2531a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1081,8 +1081,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1115,6 +1115,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1170,7 +1171,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1230,6 +1231,7 @@ static const match_table_t tokens = { {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, + {Opt_prjquota, "prjquota"}, {Opt_barrier, "barrier=%u"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, @@ -1449,8 +1451,11 @@ static const struct mount_opts { MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, + {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, + MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), + MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, {Opt_offusrjquota, 0, MOPT_Q}, @@ -1735,13 +1740,17 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && - (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " - "mount options ignored."); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { + ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. 
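
The ext4_link(), ext4_rename() and ext4_cross_rename() hunks above all enforce the same rule: when the destination directory carries EXT4_INODE_PROJINHERIT, the inode being linked or moved in must already have the same project ID, otherwise the operation fails with -EXDEV and userspace tools fall back to a copy. Condensed as a hypothetical helper (not a function added by the patch):

/*
 * Hypothetical condensation of the checks above; assumes ext4 internals
 * (EXT4_I, ext4_test_inode_flag, projid_eq) from fs/ext4/ext4.h.
 */
static bool ext4_projid_match(struct inode *dir, struct inode *inode)
{
	if (!ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
		return true;

	return projid_eq(EXT4_I(dir)->i_projid, EXT4_I(inode)->i_projid);
}
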
" + "Cannot enable project quota enforcement."); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -2635,6 +2644,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3871,7 +3886,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -5020,6 +5035,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -5052,6 +5109,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -5239,7 +5301,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5270,14 +5333,21 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) + }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, - DQUOT_USAGE_ENABLED); + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { for (type--; type >= 0; type--) dquot_quota_off(sb, type); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9fc6c2597dcc..1907299d4296 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -875,9 +875,9 @@ struct fuse_fill_data { unsigned nr_pages; }; -static int fuse_readpages_fill(void *_data, struct page *page) +static int fuse_readpages_fill(struct file *_data, struct page *page) { - struct fuse_fill_data *data = _data; + struct fuse_fill_data *data = (struct fuse_fill_data *)_data; struct fuse_req *req = data->req; struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ca9c492a1885..adecbb7a13fe 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); -static int set_global_limit(const char *val, struct kernel_param *kp); +static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, @@ -811,7 +811,7 @@ static void sanitize_global_limit(unsigned *limit) *limit = (1 << 16) - 1; } -static int set_global_limit(const char *val, struct kernel_param *kp) +static int set_global_limit(const char *val, const struct kernel_param *kp) { int rv; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 582ef53f2104..6e7b6cb3f6cd 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -476,7 +476,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * */ -static int __gfs2_readpage(void *file, struct page *page) +static int __gfs2_readpage(struct file *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); 
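
ext4_statfs_project() above clamps statfs output to the project quota: when the quota limit is tighter than the real filesystem numbers, the limit is reported as the total and the unused quota as the free space, so df inside a project tree reflects the quota rather than the whole device. The block-side clamp, restated as an illustrative helper (quota_clamp_blocks is not kernel API):

#include <linux/statfs.h>

static void quota_clamp_blocks(struct kstatfs *buf, u64 limit_blocks,
			       u64 used_blocks)
{
	if (!limit_blocks || buf->f_blocks <= limit_blocks)
		return;

	buf->f_blocks = limit_blocks;
	buf->f_bfree = buf->f_bavail =
		(limit_blocks > used_blocks) ? limit_blocks - used_blocks : 0;
}

The inode-count clamp in the hunk follows the same shape with f_files and f_ffree.
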
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 21e719199a84..7fb45ee36897 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -740,10 +740,15 @@ err_out: static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { - if (!(kn->flags & KERNFS_HAS_RELEASE)) - return; + /* + * @of is guaranteed to have no other file operations in flight and + * we just want to synchronize release and drain paths. + * @kernfs_open_file_mutex is enough. @of->mutex can't be used + * here because drain path may be called from places which can + * cause circular dependency. + */ + lockdep_assert_held(&kernfs_open_file_mutex); - mutex_lock(&of->mutex); if (!of->released) { /* * A file is never detached without being released and we @@ -753,7 +758,6 @@ static void kernfs_release_file(struct kernfs_node *kn, kn->attr.ops->release(of); of->released = true; } - mutex_unlock(&of->mutex); } static int kernfs_fop_release(struct inode *inode, struct file *filp) @@ -761,7 +765,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_file *of = kernfs_of(filp); - kernfs_release_file(kn, of); + if (kn->flags & KERNFS_HAS_RELEASE) { + mutex_lock(&kernfs_open_file_mutex); + kernfs_release_file(kn, of); + mutex_unlock(&kernfs_open_file_mutex); + } + kernfs_put_open_node(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf); @@ -794,7 +803,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn) if (kn->flags & KERNFS_HAS_MMAP) unmap_mapping_range(inode->i_mapping, 0, 0, 1); - kernfs_release_file(kn, of); + if (kn->flags & KERNFS_HAS_RELEASE) + kernfs_release_file(kn, of); } mutex_unlock(&kernfs_open_file_mutex); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ef34e31a9d57..a71e996d6c55 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -50,7 +50,8 @@ static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) if (scops && scops->show_path) return scops->show_path(sf, node, root); - return seq_dentry(sf, dentry, " \t\n\\"); + seq_dentry(sf, dentry, " \t\n\\"); + return 0; } const struct super_operations kernfs_sops = { diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index f038d4ac9aec..7890779f44b5 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -526,7 +526,7 @@ static struct ctl_table nlm_sysctl_root[] = { */ #define param_set_min_max(name, type, which_strtol, min, max) \ -static int param_set_##name(const char *val, struct kernel_param *kp) \ +static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ diff --git a/fs/locks.c b/fs/locks.c index 2c8e1e429cf7..4faeb3f6d3dc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -230,6 +230,7 @@ locks_get_lock_context(struct inode *inode, int type) ctx = smp_load_acquire(&inode->i_flctx); } out: + trace_locks_get_lock_context(inode, type, ctx); return ctx; } @@ -934,7 +935,8 @@ out: return error; } -static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +static int posix_lock_inode(struct inode *inode, struct file_lock *request, + struct file_lock *conflock) { struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; @@ -1142,6 +1144,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); + trace_posix_lock_inode(inode, request, 
error); + return error; } @@ -1162,7 +1166,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return __posix_lock_file(file_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1178,7 +1182,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int error; might_sleep (); for (;;) { - error = __posix_lock_file(inode, fl, NULL); + error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1260,7 +1264,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, if (filp) { fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (!error) break; } @@ -1268,7 +1272,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, if (sleep) fl.fl_flags |= FL_SLEEP; fl.fl_owner = current->files; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); @@ -2165,6 +2169,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; + inode = file_inode(filp); + /* * This might block, so we do it before checking the inode. */ @@ -2172,8 +2178,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. 
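
The module-parameter callbacks converted above (ipath_set_kpiobufs, param_set_kgdboc_var, the mdss_dsi_status setters, fuse's set_global_limit, lockd's param_set_min_max) all move to the const-qualified struct kernel_param pointer. A minimal sketch of a setter written against that signature, assuming these conversions are applied; my_val and set_my_val are illustrative names.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int my_val;

static int set_my_val(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret = kstrtouint(val, 0, &n);

	if (ret)
		return ret;
	if (n > 100)
		return -EINVAL;

	*(unsigned int *)kp->arg = n;
	return 0;
}

module_param_call(my_val, set_my_val, param_get_uint, &my_val, 0644);
MODULE_PARM_DESC(my_val, "example tunable (0-100)");
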
*/ @@ -2242,6 +2246,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, } } out: + trace_fcntl_setlk(inode, file_lock, error); locks_free_lock(file_lock); return error; } @@ -2398,6 +2403,7 @@ out: */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { + int error; struct file_lock lock; struct file_lock_context *ctx; @@ -2420,10 +2426,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_ops = NULL; lock.fl_lmops = NULL; - vfs_lock_file(filp, F_SETLK, &lock, NULL); + error = vfs_lock_file(filp, F_SETLK, &lock, NULL); if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); + trace_locks_remove_posix(file_inode(filp), &lock, error); } EXPORT_SYMBOL(locks_remove_posix); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a709d80c8ebc..9a202f7ed755 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -33,9 +33,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) return submit_bio_wait(rw, &bio); } -static int bdev_readpage(void *_sb, struct page *page) +static int bdev_readpage(struct file *_sb, struct page *page) { - struct super_block *sb = _sb; + struct super_block *sb = (struct super_block *)_sb; struct block_device *bdev = logfs_super(sb)->s_bdev; int err; diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 9c501449450d..4ae7d17f96e3 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -122,9 +122,9 @@ static void logfs_mtd_sync(struct super_block *sb) mtd_sync(mtd); } -static int logfs_mtd_readpage(void *_sb, struct page *page) +static int logfs_mtd_readpage(struct file *_sb, struct page *page) { - struct super_block *sb = _sb; + struct super_block *sb = (struct super_block *)_sb; int err; err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index f9b45d46d4c4..48085ab8bcd5 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -174,7 +174,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) if (!logfs_exist_block(dir, index)) continue; page = read_cache_page(dir->i_mapping, index, - (filler_t *)logfs_readpage, NULL); + logfs_readpage, NULL); if (IS_ERR(page)) return page; dd = kmap_atomic(page); @@ -306,7 +306,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx) continue; } page = read_cache_page(dir->i_mapping, pos, - (filler_t *)logfs_readpage, NULL); + logfs_readpage, NULL); if (IS_ERR(page)) return PTR_ERR(page); dd = kmap(page); diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 5f0937609465..96322622870c 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -151,7 +151,7 @@ struct logfs_device_ops { struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); int (*write_sb)(struct super_block *sb, struct page *page); - int (*readpage)(void *_sb, struct page *page); + int (*readpage)(struct file *_sb, struct page *page); void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); int (*erase)(struct super_block *sb, loff_t ofs, size_t len, int ensure_write); @@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s, #endif /* dev_mtd.c */ -#ifdef CONFIG_MTD +#if IS_ENABLED(CONFIG_MTD) int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) diff --git a/fs/namei.c b/fs/namei.c index 8f350e39be96..d6d2d0dec826 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,6 
+35,7 @@ #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> +#include <linux/init_task.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/namespace.c b/fs/namespace.c index 2443cdb9ce86..6f9a6bee9d87 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1015,6 +1015,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -2545,10 +2560,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, return -ENODEV; if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } /* Only in special cases allow devices from mounts * created outside the initial user namespace. */ @@ -2868,7 +2879,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME); + MS_STRICTATIME | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index d29ad4e02d33..f1804be48a3a 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -230,7 +230,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f592672373cb..b2a7f72eb116 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0bb580174cb3..a25985705cd5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -343,7 +343,7 @@ struct nfs_readdesc { }; static int -readpage_async_filler(void *data, struct page *page) +readpage_async_filler(struct file *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; struct nfs_page *new; diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index e5f911bd80d2..390827fa82ef 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES default n ---help--- Say Y here to enable fanotify support. 
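
The namespace.c and super.c hunks above rearrange who may create a superblock: the FS_USERNS_MOUNT check moves out of do_new_mount() and into sget_userns(), and kernel-internal mounts (MS_KERNMOUNT) plus the new submounts (MS_SUBMOUNT) bypass it. Condensed as a hypothetical predicate; the real code additionally distinguishes capable() in sget_userns() from ns_capable() in sget().

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/mount.h>

static bool may_create_super(struct file_system_type *type, int flags)
{
	if (flags & (MS_KERNMOUNT | MS_SUBMOUNT))
		return true;	/* kernel-internal mounts and submounts are trusted */

	if (type->fs_flags & FS_USERNS_MOUNT)
		return true;	/* fs opted in to unprivileged user-ns mounts */

	return capable(CAP_SYS_ADMIN);
}
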
fanotify is a file access diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..cb3703393264 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -88,13 +88,13 @@ struct workqueue_struct *user_dlm_worker; */ #define DLMFS_CAPABILITIES "bast stackglue" static int param_set_dlmfs_capabilities(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); return -EINVAL; } static int param_get_dlmfs_capabilities(char *buffer, - struct kernel_param *kp) + const struct kernel_param *kp) { return strlcpy(buffer, DLMFS_CAPABILITIES, strlen(DLMFS_CAPABILITIES) + 1); diff --git a/fs/proc/base.c b/fs/proc/base.c index 8da1fc940a86..e16afc80f810 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3202,6 +3202,15 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = default_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, diff --git a/fs/read_write.c b/fs/read_write.c index 7b175b9134ec..27023e8f531e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -21,9 +21,6 @@ #include <asm/uaccess.h> #include <asm/unistd.h> -typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); -typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); - const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, @@ -656,7 +653,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, iter_fn_t fn) + loff_t *ppos, int type) { struct kiocb kiocb; ssize_t ret; @@ -664,7 +661,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - ret = fn(&kiocb, iter); + if (type == READ) + ret = filp->f_op->read_iter(&kiocb, iter); + else + ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; @@ -672,7 +672,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, io_fn_t fn) + loff_t *ppos, int type) { ssize_t ret = 0; @@ -680,7 +680,13 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; - nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); + if (type == READ) { + nr = filp->f_op->read(filp, iovec.iov_base, + iovec.iov_len, ppos); + } else { + nr = filp->f_op->write(filp, iovec.iov_base, + iovec.iov_len, ppos); + } if (nr < 0) { if (!ret) @@ -783,8 +789,6 @@ static ssize_t do_readv_writev(int type, struct file *file, struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - io_fn_t fn; 
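
The read_write.c hunks here (the conversion continues below) drop the io_fn_t/iter_fn_t function-pointer locals; the write path needed an (io_fn_t) cast that calls ->write through a mismatched pointer type, which indirect-call checking such as Clang CFI flags. Instead, the READ/WRITE direction is carried to the call site and the properly typed operation is invoked there. The reworked iter path, condensed; do_one_iter is an illustrative name and the BUG_ON from the original is omitted.

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t do_one_iter(struct file *filp, struct iov_iter *iter,
			   loff_t *ppos, int type)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;

	if (type == READ)
		ret = filp->f_op->read_iter(&kiocb, iter);
	else
		ret = filp->f_op->write_iter(&kiocb, iter);

	*ppos = kiocb.ki_pos;
	return ret;
}
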
- iter_fn_t iter_fn; ret = import_iovec(type, uvector, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); @@ -798,19 +802,14 @@ static ssize_t do_readv_writev(int type, struct file *file, if (ret < 0) goto out; - if (type == READ) { - fn = file->f_op->read; - iter_fn = file->f_op->read_iter; - } else { - fn = (io_fn_t)file->f_op->write; - iter_fn = file->f_op->write_iter; + if (type != READ) file_start_write(file); - } - if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + if ((type == READ && file->f_op->read_iter) || + (type == WRITE && file->f_op->write_iter)) + ret = do_iter_readv_writev(file, &iter, pos, type); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, type); if (type != READ) file_end_write(file); @@ -957,8 +956,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - io_fn_t fn; - iter_fn_t iter_fn; ret = compat_import_iovec(type, uvector, nr_segs, UIO_FASTIOV, &iov, &iter); @@ -972,19 +969,14 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; - if (type == READ) { - fn = file->f_op->read; - iter_fn = file->f_op->read_iter; - } else { - fn = (io_fn_t)file->f_op->write; - iter_fn = file->f_op->write_iter; + if (type != READ) file_start_write(file); - } - if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + if ((type == READ && file->f_op->read_iter) || + (type == WRITE && file->f_op->write_iter)) + ret = do_iter_readv_writev(file, &iter, pos, type); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, type); if (type != READ) file_end_write(file); diff --git a/fs/super.c b/fs/super.c index 34f1dc72064d..55cd8843b49a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,10 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && + !(type->fs_flags & FS_USERNS_MOUNT) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); retry: spin_lock(&sb_lock); if (test) { @@ -502,7 +506,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -547,8 +551,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 52ab4aa7f214..2014a1bc8f3d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -89,7 +89,7 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #define MCOUNT_REC() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_mcount_loc) = .; \ - *(__mcount_loc) \ + KEEP(*(__mcount_loc)) \ VMLINUX_SYMBOL(__stop_mcount_loc) = .; #else #define MCOUNT_REC() @@ -123,10 +123,10 @@ #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . 
= ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ - *(_ftrace_events) \ + KEEP(*(_ftrace_events)) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ - *(_ftrace_enum_map) \ + KEEP(*(_ftrace_enum_map)) \ VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; #else #define FTRACE_EVENTS() @@ -169,8 +169,8 @@ #define _OF_TABLE_1(name) \ . = ALIGN(8); \ VMLINUX_SYMBOL(__##name##_of_table) = .; \ - *(__##name##_of_table) \ - *(__##name##_of_table_end) + KEEP(*(__##name##_of_table)) \ + KEEP(*(__##name##_of_table_end)) #define CLKSRC_OF_TABLES() OF_TABLE(CONFIG_CLKSRC_OF, clksrc) #define IRQCHIP_OF_MATCH_TABLE() OF_TABLE(CONFIG_IRQCHIP, irqchip) @@ -196,9 +196,14 @@ *(.dtb.init.rodata) \ VMLINUX_SYMBOL(__dtb_end) = .; -/* .data section */ +/* + * .data section + * -fdata-sections generates .data.identifier which needs to be pulled in + * with .data, but don't want to pull in .data..stuff which has its own + * requirements. Same for bss. + */ #define DATA_DATA \ - *(.data) \ + *(.data .data.[0-9a-zA-Z_]*) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ MEM_KEEP(init.data) \ @@ -281,28 +286,28 @@ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ - *(.pci_fixup_early) \ + KEEP(*(.pci_fixup_early)) \ VMLINUX_SYMBOL(__end_pci_fixups_early) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_header) = .; \ - *(.pci_fixup_header) \ + KEEP(*(.pci_fixup_header)) \ VMLINUX_SYMBOL(__end_pci_fixups_header) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_final) = .; \ - *(.pci_fixup_final) \ + KEEP(*(.pci_fixup_final)) \ VMLINUX_SYMBOL(__end_pci_fixups_final) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_enable) = .; \ - *(.pci_fixup_enable) \ + KEEP(*(.pci_fixup_enable)) \ VMLINUX_SYMBOL(__end_pci_fixups_enable) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_resume) = .; \ - *(.pci_fixup_resume) \ + KEEP(*(.pci_fixup_resume)) \ VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_resume_early) = .; \ - *(.pci_fixup_resume_early) \ + KEEP(*(.pci_fixup_resume_early)) \ VMLINUX_SYMBOL(__end_pci_fixups_resume_early) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_suspend) = .; \ - *(.pci_fixup_suspend) \ + KEEP(*(.pci_fixup_suspend)) \ VMLINUX_SYMBOL(__end_pci_fixups_suspend) = .; \ VMLINUX_SYMBOL(__start_pci_fixups_suspend_late) = .; \ - *(.pci_fixup_suspend_late) \ + KEEP(*(.pci_fixup_suspend_late)) \ VMLINUX_SYMBOL(__end_pci_fixups_suspend_late) = .; \ } \ \ @@ -318,76 +323,76 @@ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ - *(SORT(___ksymtab+*)) \ + KEEP(*(SORT(___ksymtab+*))) \ VMLINUX_SYMBOL(__stop___ksymtab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl) = .; \ - *(SORT(___ksymtab_gpl+*)) \ + KEEP(*(SORT(___ksymtab_gpl+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __ksymtab_unused : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused) = .; \ - *(SORT(___ksymtab_unused+*)) \ + KEEP(*(SORT(___ksymtab_unused+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused_gpl) = .; \ - *(SORT(___ksymtab_unused_gpl+*)) \ + 
KEEP(*(SORT(___ksymtab_unused_gpl+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .; \ - *(SORT(___ksymtab_gpl_future+*)) \ + KEEP(*(SORT(___ksymtab_gpl_future+*))) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .; \ } \ \ /* Kernel symbol table: Normal symbols */ \ __kcrctab : AT(ADDR(__kcrctab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab) = .; \ - *(SORT(___kcrctab+*)) \ + KEEP(*(SORT(___kcrctab+*))) \ VMLINUX_SYMBOL(__stop___kcrctab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl) = .; \ - *(SORT(___kcrctab_gpl+*)) \ + KEEP(*(SORT(___kcrctab_gpl+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __kcrctab_unused : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused) = .; \ - *(SORT(___kcrctab_unused+*)) \ + KEEP(*(SORT(___kcrctab_unused+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused_gpl) = .; \ - *(SORT(___kcrctab_unused_gpl+*)) \ + KEEP(*(SORT(___kcrctab_unused_gpl+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .; \ - *(SORT(___kcrctab_gpl_future+*)) \ + KEEP(*(SORT(___kcrctab_gpl_future+*))) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .; \ } \ \ /* Kernel symbol table: strings */ \ __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \ - *(__ksymtab_strings) \ + KEEP(*(__ksymtab_strings)) \ } \ \ /* __*init sections */ \ @@ -400,7 +405,7 @@ /* Built-in module parameters. */ \ __param : AT(ADDR(__param) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___param) = .; \ - *(__param) \ + KEEP(*(__param)) \ VMLINUX_SYMBOL(__stop___param) = .; \ } \ \ @@ -422,7 +427,7 @@ #define SECURITY_INIT \ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ - *(.security_initcall.init) \ + KEEP(*(.security_initcall.init)) \ VMLINUX_SYMBOL(__security_initcall_end) = .; \ } @@ -432,7 +437,7 @@ ALIGN_FUNCTION(); \ *(.text.hot .text.hot.*) \ *(.text .text.fixup) \ - *(.text.unlikely .text.unlikely.*) \ + *(.text.unlikely .text.unlikely .text.*) \ *(.text.unknown .text.unknown.*) \ *(.ref.text) \ *(.text.asan.* .text.tsan.*) \ @@ -481,7 +486,7 @@ VMLINUX_SYMBOL(__softirqentry_text_end) = .; /* Section used for early init (in .S files) */ -#define HEAD_TEXT *(.head.text) +#define HEAD_TEXT KEEP(*(.head.text)) #define HEAD_TEXT_SECTION \ .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { \ @@ -495,7 +500,7 @@ . 
= ALIGN(align); \ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ex_table) = .; \ - *(__ex_table) \ + KEEP(*(__ex_table)) \ VMLINUX_SYMBOL(__stop___ex_table) = .; \ } @@ -521,11 +526,12 @@ /* init and exit section handling */ #define INIT_DATA \ + KEEP(*(SORT(___kentry+*))) \ *(.init.data) \ MEM_DISCARD(init.data) \ KERNEL_CTORS() \ MCOUNT_REC() \ - *(.init.rodata) \ + *(.init.rodata .init.rodata.*) \ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ @@ -543,7 +549,7 @@ EARLYCON_TABLE() #define INIT_TEXT \ - *(.init.text) \ + *(.init.text .init.text.*) \ *(.text.startup) \ MEM_DISCARD(init.text) @@ -560,7 +566,7 @@ MEM_DISCARD(exit.text) #define EXIT_CALL \ - *(.exitcall.exit) + KEEP(*(.exitcall.exit)) /* * bss (Block Started by Symbol) - uninitialized data @@ -587,7 +593,7 @@ BSS_FIRST_SECTIONS \ *(.bss..page_aligned) \ *(.dynbss) \ - *(.bss) \ + *(.bss .bss.[0-9a-zA-Z_]*) \ *(COMMON) \ } @@ -636,7 +642,7 @@ . = ALIGN(8); \ __bug_table : AT(ADDR(__bug_table) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___bug_table) = .; \ - *(__bug_table) \ + KEEP(*(__bug_table)) \ VMLINUX_SYMBOL(__stop___bug_table) = .; \ } #else @@ -665,17 +671,17 @@ #define INIT_SETUP(initsetup_align) \ . = ALIGN(initsetup_align); \ VMLINUX_SYMBOL(__setup_start) = .; \ - *(.init.setup) \ + KEEP(*(.init.setup)) \ VMLINUX_SYMBOL(__setup_end) = .; #define INIT_CALLS_LEVEL(level) \ VMLINUX_SYMBOL(__initcall##level##_start) = .; \ - *(.initcall##level##.init) \ - *(.initcall##level##s.init) \ + KEEP(*(.initcall##level##.init)) \ + KEEP(*(.initcall##level##s.init)) \ #define INIT_CALLS \ VMLINUX_SYMBOL(__initcall_start) = .; \ - *(.initcallearly.init) \ + KEEP(*(.initcallearly.init)) \ INIT_CALLS_LEVEL(0) \ INIT_CALLS_LEVEL(1) \ INIT_CALLS_LEVEL(2) \ @@ -689,21 +695,21 @@ #define CON_INITCALL \ VMLINUX_SYMBOL(__con_initcall_start) = .; \ - *(.con_initcall.init) \ + KEEP(*(.con_initcall.init)) \ VMLINUX_SYMBOL(__con_initcall_end) = .; #define SECURITY_INITCALL \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ - *(.security_initcall.init) \ + KEEP(*(.security_initcall.init)) \ VMLINUX_SYMBOL(__security_initcall_end) = .; #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ . = ALIGN(4); \ VMLINUX_SYMBOL(__initramfs_start) = .; \ - *(.init.ramfs) \ + KEEP(*(.init.ramfs)) \ . = ALIGN(8); \ - *(.init.ramfs.info) + KEEP(*(.init.ramfs.info)) #else #define INIT_RAM_FS #endif diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d169d274e31b..26dd401e12ae 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -1,7 +1,6 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H -#include <linux/bpf.h> #include <linux/jump_label.h> #include <uapi/linux/bpf.h> @@ -55,6 +54,9 @@ int __cgroup_bpf_run_filter(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. 
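
The vmlinux.lds.h hunks above wrap the section tables in KEEP() because those tables are only reached through linker-generated boundary symbols; once the kernel is built with -ffunction-sections/-fdata-sections and linked with --gc-sections, nothing references the individual entries by name, and an un-KEEP()'d table would be garbage-collected. A sketch of the consumption pattern that makes KEEP() necessary, using the real __param boundary symbols; apply_builtin_params is an illustrative name.

#include <linux/moduleparam.h>
#include <linux/printk.h>

extern const struct kernel_param __start___param[], __stop___param[];

static void apply_builtin_params(void)
{
	const struct kernel_param *p;

	for (p = __start___param; p < __stop___param; p++) {
		/*
		 * Each entry was placed in the __param section by
		 * module_param(); nothing else references it by name.
		 */
		pr_debug("builtin param: %s\n", p->name);
	}
}
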
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ ({ \ @@ -78,6 +80,16 @@ int __cgroup_bpf_run_filter(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, \ + BPF_CGROUP_INET_SOCK_CREATE); \ + } \ + __ret; \ +}) + #else struct cgroup_bpf {}; @@ -86,6 +98,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8eb2e503fc3e..77476208f1b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,7 +36,10 @@ struct bpf_map_ops { }; struct bpf_map { - atomic_t refcnt; + /* 1st cacheline with read-mostly members of which some + * are also accessed in fast-path (e.g. ops, max_entries). + */ + const struct bpf_map_ops *ops ____cacheline_aligned; enum bpf_map_type map_type; u32 key_size; u32 value_size; @@ -44,10 +47,15 @@ struct bpf_map { u32 map_flags; u32 pages; bool unpriv_array; - struct user_struct *user; - const struct bpf_map_ops *ops; - struct work_struct work; + /* 7 bytes hole */ + + /* 2nd cacheline with misc members to avoid false sharing + * particularly with refcounting. + */ + struct user_struct *user ____cacheline_aligned; + atomic_t refcnt; atomic_t usercnt; + struct work_struct work; #ifdef CONFIG_SECURITY void *security; #endif @@ -299,6 +307,8 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); +int __bpf_prog_charge(struct user_struct *user, u32 pages); +void __bpf_prog_uncharge(struct user_struct *user, u32 pages); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -376,6 +386,16 @@ static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } + +static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + return 0; +} + +static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ +} + static inline int bpf_obj_get_user(const char __user *pathname) { return -EOPNOTSUPP; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ee2772f769c2..d3036be98027 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -601,6 +601,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -610,6 +611,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; @@ -642,7 +644,7 @@ static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_prioidx(&skcd_buf) == prioidx) return; @@ -659,7 +661,7 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data 
skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_classid(&skcd_buf) == classid) return; diff --git a/include/linux/compat.h b/include/linux/compat.h index 24dd42910d7c..c4bdce045492 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -5,6 +5,8 @@ * syscall compatibility layer. */ +#include <linux/types.h> + #ifdef CONFIG_COMPAT #include <linux/stat.h> @@ -711,9 +713,22 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); + +/* + * For most but not all architectures, "am I in a compat syscall?" and + * "am I a compat task?" are the same question. For architectures on which + * they aren't the same question, arch code can override in_compat_syscall. + */ + +#ifndef in_compat_syscall +static inline bool in_compat_syscall(void) { return is_compat_task(); } +#endif + #else #define is_compat_task() (0) +static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ + #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bc8077e5e688..759fb028acc8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -201,6 +201,29 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define unreachable() do { } while (1) #endif +/* + * KENTRY - kernel entry point + * This can be used to annotate symbols (functions or data) that are used + * without their linker symbol being referenced explicitly. For example, + * interrupt vector handlers, or functions in the kernel image that are found + * programmatically. + * + * Not required for symbols exported with EXPORT_SYMBOL, or initcalls. Those + * are handled in their own way (with KEEP() in linker scripts). + * + * KENTRY can be avoided if the symbols in question are marked as KEEP() in the + * linker script. For example an architecture could KEEP() its entire + * boot/exception vector code rather than annotate each function and data. + */ +#ifndef KENTRY +# define KENTRY(sym) \ + extern typeof(sym) sym; \ + static const unsigned long __kentry_##sym \ + __used \ + __attribute__((section("___kentry" "+" #sym ), used)) \ + = (unsigned long)&sym; +#endif + #ifndef RELOC_HIDE # define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 19c066dce1da..51362ff607fc 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -60,9 +60,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); +typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data); void debugfs_remove(struct dentry *dentry); diff --git a/include/linux/export.h b/include/linux/export.h index 96e45ea463e7..a84c76fdebd8 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -1,5 +1,6 @@ #ifndef _LINUX_EXPORT_H #define _LINUX_EXPORT_H + /* * Export symbols from the kernel to modules.
Forked from module.h * to reduce the amount of pointless cruft we feed to gcc when only @@ -42,11 +43,11 @@ extern struct module __this_module; #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to * generate a checksums for some symbols */ -#define __CRC_SYMBOL(sym, sec) \ - extern __visible void *__crc_##sym __attribute__((weak)); \ - static const unsigned long __kcrctab_##sym \ - __used \ - __attribute__((section("___kcrctab" sec "+" #sym), unused)) \ +#define __CRC_SYMBOL(sym, sec) \ + extern __visible void *__crc_##sym __attribute__((weak)); \ + static const unsigned long __kcrctab_##sym \ + __used \ + __attribute__((section("___kcrctab" sec "+" #sym), used)) \ = (unsigned long) &__crc_##sym; #else #define __CRC_SYMBOL(sym, sec) @@ -59,8 +60,7 @@ extern struct module __this_module; static const char __kstrtab_##sym[] \ __attribute__((section("__ksymtab_strings"), aligned(1))) \ = VMLINUX_SYMBOL_STR(sym); \ - extern const struct kernel_symbol __ksymtab_##sym; \ - __visible const struct kernel_symbol __ksymtab_##sym \ + static const struct kernel_symbol __ksymtab_##sym \ __used \ __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a85d7b71e329..c0b08bbe85b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -244,8 +244,16 @@ static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) return *this_cpu_ptr(ops->disabled); } +#ifdef CONFIG_CFI_CLANG +/* Use a C stub with the correct type for CFI */ +static inline void ftrace_stub(unsigned long a0, unsigned long a1, + struct ftrace_ops *op, struct pt_regs *regs) +{ +} +#else extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); +#endif #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a5f251d76018..858031d58899 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -535,7 +535,7 @@ extern void free_kmem_pages(unsigned long addr, unsigned int order); void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); -void drain_local_pages(struct zone *zone); +void drain_local_pages(void *zone); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void page_alloc_init_late(void); diff --git a/include/linux/init.h b/include/linux/init.h index 3561ea30ed8c..9ae494724b1b 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -163,24 +163,8 @@ extern bool initcall_debug; #ifndef __ASSEMBLY__ -#ifdef CONFIG_LTO -/* Work around a LTO gcc problem: when there is no reference to a variable - * in a module it will be moved to the end of the program. This causes - * reordering of initcalls which the kernel does not like. - * Add a dummy reference function to avoid this. The function is - * deleted by the linker. - */ -#define LTO_REFERENCE_INITCALL(x) \ - ; /* yes this is needed */ \ - static __used __exit void *reference_##x(void) \ - { \ - return &x; \ - } -#else -#define LTO_REFERENCE_INITCALL(x) -#endif - -/* initcalls are now grouped by functionality into separate +/* + * initcalls are now grouped by functionality into separate * subsections. Ordering inside the subsections is determined * by link order. 
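As an illustrative aside (not part of the patch): the KENTRY() annotation added to include/linux/compiler.h earlier in this series pins a symbol that is only found programmatically, and the pointer it emits lands in the ___kentry+<sym> section that the new KEEP(*(SORT(___kentry+*))) rule in INIT_DATA preserves from --gc-sections. A minimal sketch with a hypothetical symbol name:

/*
 * example_vector_entry() is never referenced by name from C code; it is
 * located at runtime (for example via a table built by early boot code),
 * so the linker would otherwise discard it when sections are garbage
 * collected.
 */
void example_vector_entry(void)
{
	/* ... entry code located programmatically ... */
}
KENTRY(example_vector_entry);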
* For backwards compatibility, initcall() puts the call in @@ -188,12 +172,16 @@ extern bool initcall_debug; * * The `id' arg to __define_initcall() is needed so that multiple initcalls * can point at the same handler without causing duplicate-symbol build errors. + * + * Initcalls are run by placing pointers in initcall sections that the + * kernel iterates at runtime. The linker can do dead code / data elimination + * and remove that completely, so the initcall sections have to be marked + * as KEEP() in the linker script. */ #define __define_initcall(fn, id) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; \ - LTO_REFERENCE_INITCALL(__initcall_##fn##id) + __attribute__((__section__(".initcall" #id ".init"))) = fn; /* * Early initcalls run before initializing SMP. @@ -229,15 +217,15 @@ extern bool initcall_debug; #define __initcall(fn) device_initcall(fn) -#define __exitcall(fn) \ +#define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ +#define console_initcall(fn) \ + static initcall_t __initcall_##fn \ __used __section(.con_initcall.init) = fn -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ +#define security_initcall(fn) \ + static initcall_t __initcall_##fn \ __used __section(.security_initcall.init) = fn struct obs_kernel_param { diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 52666d90ca94..060db5c327be 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -225,19 +225,11 @@ struct kparam_array VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } /* Obsolete - use module_param_cb() */ -#define module_param_call(name, set, get, arg, perm) \ - static const struct kernel_param_ops __param_ops_##name = \ - { .flags = 0, (void *)set, (void *)get }; \ +#define module_param_call(name, _set, _get, arg, perm) \ + static const struct kernel_param_ops __param_ops_##name = \ + { .flags = 0, .set = _set, .get = _get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ - name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, -1, 0) - -/* We don't get oldget: it's often a new-style param_get_uint, etc. 
*/ -static inline int -__check_old_set_param(int (*oldset)(const char *, struct kernel_param *)) -{ - return 0; -} + name, &__param_ops_##name, arg, perm, -1, 0) #ifdef CONFIG_SYSFS extern void kernel_param_lock(struct module *mod); diff --git a/include/linux/mount.h b/include/linux/mount.h index b606d8f57adf..bbe48613cc80 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -90,6 +90,9 @@ struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); +extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, + struct file_system_type *type, + const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d2f4a732b3e8..ca7b7cbcb23a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -243,7 +243,7 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); } -typedef int filler_t(void *, struct page *); +typedef int filler_t(struct file *, struct page *); pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); @@ -392,7 +392,7 @@ extern int read_cache_pages(struct address_space *mapping, static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; + filler_t *filler = mapping->a_ops->readpage; return read_cache_page(mapping, index, filler, data); } diff --git a/include/linux/pid.h b/include/linux/pid.h index 97b745ddece5..1d847d3de245 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,6 +2,7 @@ #define _LINUX_PID_H #include <linux/rcupdate.h> +#include <linux/wait.h> enum pid_type { @@ -62,6 +63,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; + /* wait queue for pidfd notifications */ + wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; }; @@ -74,6 +77,8 @@ struct pid_link struct pid *pid; }; +extern const struct file_operations pidfd_fops; + static inline struct pid *get_pid(struct pid *pid) { if (pid) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b326d0a0cace..de3f442b7ab6 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -41,6 +41,7 @@ extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); extern int remove_proc_subtree(const char *, struct proc_dir_entry *); +extern struct pid *tgid_pidfd_to_pid(const struct file *file); #else /* CONFIG_PROC_FS */ @@ -72,6 +73,11 @@ static inline void proc_remove(struct proc_dir_entry *de) {} #define remove_proc_entry(name, parent) do {} while (0) static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } +static inline struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + return ERR_PTR(-EBADF); +} + #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_PROC_UID diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 19bac77f81b1..8020dca6c61f 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -98,6 +98,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, if 
(!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4f2933707b13..0c62b1d98ea0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -222,6 +222,11 @@ struct sk_buff; #endif extern int sysctl_max_skb_frags; +/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to + * segment using its current segmentation instead. + */ +#define GSO_BY_FRAGS 0xFFFF + typedef struct skb_frag_struct skb_frag_t; struct skb_frag_struct { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5d2779aa4bbe..cf569d676909 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -879,6 +879,7 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long sys_setns(int fd, int nstype); +asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags); asmlinkage long sys_process_vm_readv(pid_t pid, const struct iovec __user *lvec, unsigned long liovcnt, @@ -909,4 +910,8 @@ asmlinkage long sys_membarrier(int cmd, int flags); asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); +asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, + siginfo_t __user *info, + unsigned int flags); + #endif diff --git a/include/linux/udp.h b/include/linux/udp.h index 87c094961bd5..32342754643a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -98,11 +98,11 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } -#define udp_portaddr_for_each_entry(__sk, node, list) \ - hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry(__sk, list) \ + hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) -#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \ - hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry_rcu(__sk, list) \ + hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 07aed691fe00..fa06e7418f41 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -90,6 +90,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 7ff588ca6817..28332bdac333 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net, struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net, static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const 
__be16 sport, const struct in6_addr *daddr, @@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net, if (sk) return sk; - return inet6_lookup_listener(net, hashinfo, saddr, sport, + return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif); } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, + struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif) @@ -86,16 +88,19 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - &ipv6_hdr(skb)->saddr, sport, + return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), iif); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); + +int inet6_hash(struct sock *sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index de2e3ade6102..50f635c2c536 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -207,12 +207,16 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); -void __inet_hash(struct sock *sk, struct sock *osk); -void inet_hash(struct sock *sk); +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)); +int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, @@ -220,10 +224,11 @@ struct sock *__inet_lookup_listener(struct net *net, static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __inet_lookup_listener(net, hashinfo, saddr, sport, + return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif); } @@ -299,6 +304,7 @@ static inline struct sock * static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -307,12 +313,13 @@ static inline struct sock *__inet_lookup(struct net *net, struct sock *sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif); - return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport, - daddr, hnum, dif); + return sk ? 
: __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + sport, daddr, hnum, dif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -320,7 +327,8 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; local_bh_disable(); - sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif); + sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + dport, dif); local_bh_enable(); return sk; @@ -328,6 +336,7 @@ static inline struct sock *inet_lookup(struct net *net, static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, + int doff, const __be16 sport, const __be16 dport) { @@ -337,8 +346,8 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; else - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - iph->saddr, sport, + return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb)); } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 3d4fb0e2a3bf..798e5f95494b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -374,6 +374,17 @@ static inline void ip_tunnel_unneed_metadata(void) { } +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + info->options_len = 0; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e3f73fd1d53a..a42d65ca0c27 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -286,7 +286,7 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) struct kernel_param; -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; extern unsigned int nf_conntrack_max; extern unsigned int nf_conntrack_hash_rnd; diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 68e509750caa..039cc29cb4a8 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -51,7 +51,7 @@ void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); -void pn_sock_hash(struct sock *sk); +int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); diff --git a/include/net/ping.h b/include/net/ping.h index ac80cb45e630..5fd7cc244833 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -65,7 +65,7 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -void ping_hash(struct sock *sk); +int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/include/net/raw.h b/include/net/raw.h index 6a40c6562dd2..3e789008394d 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -57,7 +57,7 @@ int raw_seq_open(struct inode *ino, struct file *file, #endif -void raw_hash_sk(struct sock *sk); +int 
raw_hash_sk(struct sock *sk); void raw_unhash_sk(struct sock *sk); struct raw_sock { diff --git a/include/net/sock.h b/include/net/sock.h index c8d332b75a40..4c417486a1b8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -193,7 +193,7 @@ struct sock_common { int skc_bound_dev_if; union { struct hlist_node skc_bind_node; - struct hlist_nulls_node skc_portaddr_node; + struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; @@ -699,18 +699,18 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry(__sk, list, sk_bind_node) /** - * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset + * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ -#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset) \ - for (pos = (head)->first; \ - (!is_a_nulls(pos)) && \ +#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ + for (pos = rcu_dereference((head)->first); \ + pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ - pos = pos->next) + pos = rcu_dereference(pos->next)) static inline struct user_namespace *sk_user_ns(struct sock *sk) { @@ -1019,7 +1019,7 @@ struct proto { void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ - void (*hash)(struct sock *sk); + int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); @@ -1348,10 +1348,10 @@ static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. 
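Worth noting for the struct proto change above (and the matching ping/raw/phonet/udp prototypes): ->hash() now returns int so a failure can reach the caller instead of being ignored. A hedged sketch of the resulting caller pattern, with an illustrative function name:

static int example_start_listening(struct sock *sk)
{
	int err;

	/* ->hash() may now fail, for example with -ENOMEM while setting
	 * up a SO_REUSEPORT group, and the error must be propagated. */
	err = sk->sk_prot->hash(sk);
	if (err)
		return err;

	return 0;
}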
*/ -static inline void __sk_prot_rehash(struct sock *sk) +static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); + return sk->sk_prot->hash(sk); } void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 7dda3d7adba8..aecd30308d50 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -16,7 +16,7 @@ struct sock_reuseport { }; extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/net/udp.h b/include/net/udp.h index 13bd641be453..3f52b18ea12e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -59,7 +59,7 @@ struct udp_skb_cb { * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_nulls_head head; + struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); @@ -177,9 +177,10 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ -static inline void udp_lib_hash(struct sock *sk) +static inline int udp_lib_hash(struct sock *sk) { BUG(); + return 0; } void udp_lib_unhash(struct sock *sk); diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index c72f2dc01d0b..63a7680347cb 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -34,6 +34,83 @@ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) +TRACE_EVENT(locks_get_lock_context, + TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), + + TP_ARGS(inode, type, ctx), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(unsigned char, type) + __field(struct file_lock_context *, ctx) + ), + + TP_fast_assign( + __entry->s_dev = inode->i_sb->s_dev; + __entry->i_ino = inode->i_ino; + __entry->type = type; + __entry->ctx = ctx; + ), + + TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) +); + +DECLARE_EVENT_CLASS(filelock_lock, + TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), + + TP_ARGS(inode, fl, ret), + + TP_STRUCT__entry( + __field(struct file_lock *, fl) + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(struct file_lock *, fl_next) + __field(fl_owner_t, fl_owner) + __field(unsigned int, fl_pid) + __field(unsigned int, fl_flags) + __field(unsigned char, fl_type) + __field(loff_t, fl_start) + __field(loff_t, fl_end) + __field(int, ret) + ), + + TP_fast_assign( + __entry->fl = fl ? fl : NULL; + __entry->s_dev = inode->i_sb->s_dev; + __entry->i_ino = inode->i_ino; + __entry->fl_next = fl ? fl->fl_next : NULL; + __entry->fl_owner = fl ? fl->fl_owner : NULL; + __entry->fl_pid = fl ? fl->fl_pid : 0; + __entry->fl_flags = fl ? fl->fl_flags : 0; + __entry->fl_type = fl ? fl->fl_type : 0; + __entry->fl_start = fl ? fl->fl_start : 0; + __entry->fl_end = fl ? 
fl->fl_end : 0; + __entry->ret = ret; + ), + + TP_printk("fl=0x%p dev=0x%x:0x%x ino=0x%lx fl_next=0x%p fl_owner=0x%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", + __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, __entry->fl_next, __entry->fl_owner, + __entry->fl_pid, show_fl_flags(__entry->fl_flags), + show_fl_type(__entry->fl_type), + __entry->fl_start, __entry->fl_end, __entry->ret) +); + +DEFINE_EVENT(filelock_lock, posix_lock_inode, + TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), + TP_ARGS(inode, fl, ret)); + +DEFINE_EVENT(filelock_lock, fcntl_setlk, + TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), + TP_ARGS(inode, fl, ret)); + +DEFINE_EVENT(filelock_lock, locks_remove_posix, + TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), + TP_ARGS(inode, fl, ret)); + DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1324b0292ec2..a6b339e1d70e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -715,9 +715,13 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) __SYSCALL(__NR_membarrier, sys_membarrier) #define __NR_mlock2 284 __SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) +#define __NR_pidfd_open 434 +__SYSCALL(__NR_pidfd_open, sys_pidfd_open) #undef __NR_syscalls -#define __NR_syscalls 285 +#define __NR_syscalls 435 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b8effe6bf417..bbbf36a963e6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,11 +99,13 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, }; enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, __MAX_BPF_ATTACH_TYPE }; @@ -627,6 +629,10 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +struct bpf_sock { + __u32 bound_dev_if; +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. 
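The BPF_PROG_TYPE_CGROUP_SOCK type and the struct bpf_sock context added to the UAPI above let a program run at socket creation read and write bound_dev_if. A sketch of such a program in restricted C (the section name, the ifindex, and the clang -target bpf build are assumptions, not mandated by this patch):

#include <linux/bpf.h>

__attribute__((section("cgroup/sock"), used))
int bind_new_sockets_to_dev(struct bpf_sock *sk)
{
	sk->bound_dev_if = 2;	/* hypothetical ifindex */
	return 1;		/* returning 1 allows the socket to be created */
}

char _license[] __attribute__((section("license"), used)) = "GPL";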
Unknown return codes will result diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c66b52303114..51caeb7797d0 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -93,6 +93,7 @@ struct inodes_stat_t { #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe019a720..ed6e31d9c195 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -9,6 +9,7 @@ #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */ #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ diff --git a/init/Kconfig b/init/Kconfig index af433ff5356f..a5ae0470b7b2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1462,9 +1462,6 @@ endchoice config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1654,14 +1651,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1671,7 +1666,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1681,7 +1675,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1692,7 +1685,6 @@ config EVENTFD # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF default n help @@ -1747,7 +1739,6 @@ config BPF_UNPRIV_DEFAULT_OFF config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1799,7 +1790,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help diff --git a/init/Makefile b/init/Makefile index 243f61de2cba..d210b235c5d7 100644 --- a/init/Makefile +++ b/init/Makefile @@ -2,6 +2,8 @@ # Makefile for the linux kernel. # +ccflags-y := -fno-function-sections -fno-data-sections + obj-y := main.o version.o mounts.o obj-y += noinitramfs.o obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 357ce8355d57..8210c7dd7532 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -424,3 +424,36 @@ int __cgroup_bpf_run_filter(struct sock *sk, return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be executed + * + * The socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if an attached program was found + * and it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]->progs[0]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95ffe1fac0bf..2b1a925489cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -60,11 +60,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns { u8 *ptr = NULL; - if (k >= SKF_NET_OFF) + if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) + } else if (k >= SKF_LL_OFF) { + if (unlikely(!skb_mac_header_was_set(skb))) + return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - + } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -105,19 +107,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a73a056ccd88..f65bd2dc07f8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -652,19 +652,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages,
&user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -673,7 +693,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } @@ -933,7 +953,24 @@ static int bpf_prog_attach(const union bpf_attr *attr) bpf_prog_put(prog); cgroup_put(cgrp); break; + case BPF_CGROUP_INET_SOCK_CREATE: + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_CGROUP_SOCK); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + if (ret) + bpf_prog_put(prog); + cgroup_put(cgrp); + break; default: return -EINVAL; } @@ -961,7 +998,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_EGRESS: ptype = BPF_PROG_TYPE_CGROUP_SKB; break; - + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; default: return -EINVAL; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d36c4f914a1e..c2508ca442b7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1544,6 +1544,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1560,8 +1561,28 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. 
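To exercise the BPF_CGROUP_INET_SOCK_CREATE attach path handled in bpf_prog_attach() above, userspace (with the appropriate privileges) loads a BPF_PROG_TYPE_CGROUP_SOCK program and attaches it to a cgroup directory fd. A hedged sketch using raw bpf(2); the cgroup path is an assumption and error handling is trimmed:

#include <fcntl.h>
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	struct bpf_insn insns[] = {
		/* r0 = 1; exit -> unconditionally allow socket creation */
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 1 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;
	int prog_fd, cgrp_fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = 2;
	attr.license = (unsigned long)"GPL";
	prog_fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
	if (prog_fd < 0)
		return 1;

	/* Assumed cgroup v2 directory; any cgroup fd works as target_fd. */
	cgrp_fd = open("/sys/fs/cgroup/test", O_RDONLY | O_DIRECTORY);
	if (cgrp_fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgrp_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type = BPF_CGROUP_INET_SOCK_CREATE;
	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) ? 1 : 0;
}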
+ */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1570,10 +1591,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); diff --git a/kernel/exit.c b/kernel/exit.c index babbc3c0a181..052aa52de331 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -616,6 +616,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/fork.c b/kernel/fork.c index 92a0df862115..a21adc0155b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> @@ -38,6 +39,7 @@ #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/seccomp.h> +#include <linux/seq_file.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> @@ -1294,6 +1296,84 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) task->pids[type].pid = pid; } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = file_inode(m->file)->i_sb->s_fs_info; + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct task_struct *task; + struct pid *pid = file->private_data; + int poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + /* + * Inform pollers only when the whole thread group exits. + * If the thread group leader exits before all other threads in the + * group, then poll(2) should block, similar to the wait(2) family. + */ + if (!task || (task->exit_state && thread_group_empty(task))) + poll_flags = POLLIN | POLLRDNORM; + rcu_read_unlock(); + + return poll_flags; +} + +const struct file_operations pidfd_fops = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. 
+ * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1305,13 +1385,14 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) @@ -1356,6 +1437,31 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1538,6 +1644,22 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). 
+ */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1587,7 +1709,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -1698,8 +1820,12 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_cgroup_threadgroup_change_end: + cgroup_threadgroup_change_end(current); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + sys_close(pidfd); bad_fork_free_pid: - threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1754,7 +1880,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); @@ -1799,7 +1925,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer diff --git a/kernel/pid.c b/kernel/pid.c index ccfdb56321c6..9bc06f3a2b54 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -38,6 +38,8 @@ #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/proc_fs.h> +#include <linux/anon_inodes.h> +//#include <linux/sched/signal.h> #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -331,6 +333,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) @@ -564,6 +568,76 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return pid; } +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +/** + * pidfd_open() - Open new pid file descriptor. + * + * @pid: pid for which to retrieve a pidfd + * @flags: flags to pass + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set for + * the process identified by @pid. Currently, the process identified by + * @pid must be a thread-group leader. This restriction currently exists + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot + * be used with CLONE_THREAD) and pidfd polling (only supports thread group + * leaders). 
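A userspace sketch of the CLONE_PIDFD handling added to copy_process() above (hypothetical program, not from this patch): the new pidfd comes back through the parent_tid pointer, which must contain 0 beforehand because of the reserved-word check above. The fallback define mirrors the value added to include/uapi/linux/sched.h.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000
#endif

static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	int pidfd = 0;	/* must read as 0; the kernel overwrites it */
	pid_t pid;

	pid = clone(child_fn, child_stack + sizeof(child_stack),
		    CLONE_PIDFD | SIGCHLD, NULL, &pidfd);
	if (pid < 0)
		return 1;

	printf("child pid %d, pidfd %d\n", pid, pidfd);
	close(pidfd);
	return 0;
}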
+ * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) +{ + int fd, ret; + struct pid *p; + struct task_struct *tsk; + + if (flags) + return -EINVAL; + + if (pid <= 0) + return -EINVAL; + + p = find_get_pid(pid); + if (!p) + return -ESRCH; + + ret = 0; + rcu_read_lock(); + tsk = pid_task(p, PIDTYPE_PID); + /* Check that pid belongs to a group leader task */ + if (!tsk || !thread_group_leader(tsk)) + ret = -EINVAL; + rcu_read_unlock(); + + fd = ret ?: pidfd_create(p); + put_pid(p); + return fd; +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or diff --git a/kernel/signal.c b/kernel/signal.c index a699055ebfe8..a9697e189a58 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -14,7 +14,9 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -1632,6 +1634,14 @@ ret: return ret; } +static void do_notify_pidfd(struct task_struct *task) +{ + struct pid *pid; + + pid = task_pid(task); + wake_up_all(&pid->wait_pidfd); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1655,6 +1665,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* Wake up all pidfd waiters */ + do_notify_pidfd(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. @@ -2922,6 +2935,15 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +static inline void prepare_kill_siginfo(int sig, struct siginfo *info) +{ + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -2931,15 +2953,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + +static int copy_siginfo_from_user_any(siginfo_t *kinfo, siginfo_t __user *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. 
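Putting the new interfaces together, a hedged userspace sketch of sys_pidfd_open() above and sys_pidfd_send_signal() just below: obtain a pidfd for an existing thread-group leader, ask it to terminate, and poll() for the exit notification that do_notify_pidfd() delivers. The syscall numbers are the asm-generic ones added earlier (434 and 424); other architectures may assign different numbers.

#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal 424
#endif

int main(int argc, char **argv)
{
	struct pollfd pfd;
	pid_t pid;
	int pidfd;

	if (argc != 2)
		return 1;
	pid = (pid_t)atoi(argv[1]);

	pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0)
		return 1;

	/* A NULL info makes the kernel build SI_USER siginfo, as kill(2) does. */
	if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0)
		return 1;

	pfd.fd = pidfd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);	/* returns once the whole thread group has exited */

	printf("process %d has exited\n", pid);
	close(pidfd);
	return 0;
}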
+ */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_from_user(kinfo, info, sizeof(siginfo_t)); +} + +/** + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + goto err; + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c5484723abda..74fa302d9a68 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -128,8 +128,9 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); #else /* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#define ftrace_ops_list_func ftrace_ops_no_ops #endif /* @@ -5229,7 +5230,8 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } @@ -5677,14 +5679,17 @@ void ftrace_graph_graph_time_control(bool enable) fgraph_graph_time = enable; } +void ftrace_graph_return_stub(struct ftrace_graph_ret *trace) +{ +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; } /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_return_stub; trace_func_graph_ent_t ftrace_graph_entry = 
ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -5912,7 +5917,7 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_return_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a67f792fb950..17996354c745 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7026,7 +7026,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7039,7 +7039,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; diff --git a/mm/filemap.c b/mm/filemap.c index 8309b5b773cd..1466db70dc42 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2245,7 +2245,7 @@ static struct page *wait_on_page_read(struct page *page) static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *, struct page *), + int (*filler)(struct file *, struct page *), void *data, gfp_t gfp) { @@ -2360,7 +2360,7 @@ out: */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *, struct page *), + int (*filler)(struct file *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -2382,7 +2382,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; + filler_t *filler = mapping->a_ops->readpage; return do_read_cache_page(mapping, index, filler, NULL, gfp); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd0c4bc00941..c7126875d8dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2087,8 +2087,9 @@ static void drain_pages(unsigned int cpu) * The CPU has to be pinned. When zone parameter is non-NULL, spill just * the single zone's pages. */ -void drain_local_pages(struct zone *zone) +void drain_local_pages(void *z) { + struct zone *zone = (struct zone *)z; int cpu = smp_processor_id(); if (zone) @@ -2148,8 +2149,7 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, zone, 1); } #ifdef CONFIG_HIBERNATION diff --git a/mm/readahead.c b/mm/readahead.c index fb1d210dbf05..3e25601ce14b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -81,7 +81,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * Hides the details of the LRU cache etc from the filesystems. 
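For other users of debugfs_create_automount(), the debugfs_automount_t typedef introduced earlier means callbacks now receive the mountpoint dentry and can create the mount with vfs_submount(), as the trace_automount() conversion above does. A sketch with placeholder names (example_fs_type and examplefs are hypothetical):

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/mount.h>

static struct file_system_type example_fs_type;	/* placeholder */

static struct vfsmount *example_automount(struct dentry *mntpt, void *data)
{
	/* vfs_submount() marks the result as a submount of @mntpt (via the
	 * new MS_SUBMOUNT flag) instead of a detached kernel mount. */
	return vfs_submount(mntpt, &example_fs_type, "examplefs", NULL);
}

static void example_register(struct dentry *parent)
{
	debugfs_create_automount("example", parent, example_automount, NULL);
}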
*/ int read_cache_pages(struct address_space *mapping, struct list_head *pages, - int (*filler)(void *, struct page *), void *data) + int (*filler)(struct file *, struct page *), void *data) { struct page *page; int ret = 0; diff --git a/net/compat.c b/net/compat.c index 14459a87fdbc..53e58bfe0dc6 100644 --- a/net/compat.c +++ b/net/compat.c @@ -356,7 +356,8 @@ static int do_set_sock_timeout(struct socket *sock, int level, static int compat_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { - if (optname == SO_ATTACH_FILTER) + if (optname == SO_ATTACH_FILTER || + optname == SO_ATTACH_REUSEPORT_CBPF) return do_set_attach_filter(sock, level, optname, optval, optlen); if (!COMPAT_USE_64BIT_TIME && diff --git a/net/core/filter.c b/net/core/filter.c index fa6ec1310621..ebf7754810f0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1398,7 +1398,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; @@ -1433,7 +1433,7 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, { void *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); @@ -2594,6 +2594,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: return NULL; } @@ -2719,6 +2721,29 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. 
*/ + if (off % size != 0) + return false; + + return true; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -2977,6 +3002,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -3050,6 +3099,12 @@ static const struct bpf_verifier_ops cg_skb_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3075,6 +3130,11 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = { .type = BPF_PROG_TYPE_CGROUP_SKB, }; +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3082,6 +3142,7 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); return 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b7734f91abd6..15a7a2667e1e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3105,6 +3105,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int pos; int dummy; + if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && + (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { + /* gso_size is untrusted, and we have a frag_list with a linear + * non head_frag head. + * + * (we assume checking the first list_skb member suffices; + * i.e if either of the list_skb members have non head_frag + * head, then the first one has too). + * + * If head_skb's headlen does not fit requested gso_size, it + * means that the frag_list members do NOT terminate on exact + * gso_size boundaries. Hence we cannot perform skb_frag_t page + * sharing. Therefore we must fallback to copying the frag_list + * skbs; we do so by disabling SG. 
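The BPF_PROG_TYPE_CGROUP_SOCK plumbing above (sock_filter_is_valid_access() plus sock_filter_convert_ctx_access()) only permits writes to the bound_dev_if member of struct bpf_sock. A minimal sketch of such a program, assuming a clang/libbpf-style "cgroup/sock" ELF section and a hard-coded interface index of 2, both purely illustrative:

    #include <linux/bpf.h>

    /* Attached to a cgroup with BPF_PROG_ATTACH; runs when a socket is created. */
    __attribute__((section("cgroup/sock"), used))
    int sock_pin_to_dev(struct bpf_sock *sk)
    {
            sk->bound_dev_if = 2;   /* the only writable context field for this program type */
            return 1;               /* non-zero return lets the socket be created */
    }

The verifier paths added above reject any other store into the context, and BPF_CGROUP_RUN_PROG_INET_SOCK() in the af_inet/af_inet6 hunks further down is where such a program actually runs.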
+ */ + if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) + features &= ~NETIF_F_SG; + } + __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) @@ -3122,9 +3141,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int hsize; int size; - len = head_skb->len - offset; - if (len > mss) - len = mss; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } hsize = skb_headlen(head_skb) - offset; if (hsize < 0) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ae0969c0fc2e..2ba3ae7720a8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -36,9 +36,14 @@ int reuseport_alloc(struct sock *sk) * soft irq of receive path or setsockopt from process context */ spin_lock_bh(&reuseport_lock); - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - "multiple allocations for the same socket"); + + /* Allocation attempts can occur concurrently via the setsockopt path + * and the bind/hash path. Nothing to do when we lose the race. + */ + if (rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock))) + goto out; + reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -49,6 +54,7 @@ int reuseport_alloc(struct sock *sk) reuse->num_socks = 1; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); +out: spin_unlock_bh(&reuseport_lock); return 0; @@ -87,22 +93,42 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) return more_reuse; } +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + /** * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. 
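reuseport_alloc() and reuseport_add_sock() above implement the kernel side of SO_REUSEPORT: a group forms when several sockets owned by the same user enable the option and bind the same address and port. A minimal userspace sketch of building such a group (illustrative only, error handling omitted):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Each worker calls this; all resulting sockets share one reuseport group. */
    static int reuseport_udp_socket(unsigned short port)
    {
            struct sockaddr_in addr;
            int one = 1;
            int fd = socket(AF_INET, SOCK_DGRAM, 0);

            /* Must be set before bind() so the sockets join the same group. */
            setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

            memset(&addr, 0, sizeof(addr));
            addr.sin_family = AF_INET;
            addr.sin_port = htons(port);
            addr.sin_addr.s_addr = htonl(INADDR_ANY);
            bind(fd, (struct sockaddr *)&addr, sizeof(addr));

            return fd;
    }

Incoming packets are then spread across the group, either by the default hash or by a program attached with SO_ATTACH_REUSEPORT_CBPF/EBPF, which is why the compat path above also forwards SO_ATTACH_REUSEPORT_CBPF to do_set_attach_filter().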
*/ -int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2) { - struct sock_reuseport *reuse; + struct sock_reuseport *old_reuse, *reuse; + + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2); + + if (err) + return err; + } spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - "socket already in reuseport group"); + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } if (reuse->num_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); @@ -120,20 +146,12 @@ int reuseport_add_sock(struct sock *sk, const struct sock *sk2) spin_unlock_bh(&reuseport_lock); + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } EXPORT_SYMBOL(reuseport_add_sock); -static void reuseport_free_rcu(struct rcu_head *head) -{ - struct sock_reuseport *reuse; - - reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); - kfree(reuse); -} - void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; @@ -173,7 +191,7 @@ static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, /* temporarily advance data past protocol header */ if (!pskb_pull(skb, hdr_len)) { - consume_skb(nskb); + kfree_skb(nskb); return NULL; } index = bpf_prog_run_save_cb(prog, skb); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3f51280374f0..ac0e8152c0f1 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -804,7 +804,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) } lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, + sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index bb1a7405dc0e..0958721e4d3d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -684,7 +684,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); if (!sk) { @@ -1010,7 +1010,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 42ab1b61b513..6383627b783e 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); -static void raw_hash(struct sock *sk) +static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); + + return 0; } static void raw_unhash(struct sock *sk) @@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk) return container_of(sk, struct dgram_sock, sk); } -static void 
dgram_hash(struct sock *sk) +static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); + + return 0; } static void dgram_unhash(struct sock *sk) @@ -1034,8 +1038,13 @@ static int ieee802154_create(struct net *net, struct socket *sock, /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); - if (sk->sk_prot->hash) - sk->sk_prot->hash(sk); + if (sk->sk_prot->hash) { + rc = sk->sk_prot->hash(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 39eeb5c2d23d..a35c252235ae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -387,13 +387,27 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; @@ -1159,8 +1173,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4b50bcbc6100..f6f3ee843d12 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -157,6 +159,7 @@ again: sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(tb->fastuid, uid))) && (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; @@ -225,15 +228,18 @@ tb_found: if (((tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && smallest_size == -1) { goto success; } else { ret = 1; if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && smallest_size != -1 && --attempts >= 0) { spin_unlock(&head->lock); goto again; @@ -755,6 +761,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; 
reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -772,13 +779,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index fcb83b2a61f0..1582fe5b04c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -370,18 +370,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, + sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 738cd5c822b1..b947e4c9be18 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -206,6 +208,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -215,6 +218,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -230,6 +234,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -247,11 +260,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -450,32 +465,73 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool 
match_wildcard)) +{ + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + inet_csk(sk2)->icsk_bind_hash == tb && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + return reuseport_alloc(sk); +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -494,6 +550,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c3f808602b65..f56e803c07f9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ + + return 0; } void ping_unhash(struct sock *sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 81ade975db5f..b75a24adb580 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e373dcdab5e2..4a9d8c117794 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -643,8 +643,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
*/ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -1631,7 +1631,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1754,7 +1755,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0a0d44c53373..4976a1c9835f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned int log) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { + udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, bool match_wildcard)) { struct net *net = sock_net(sk); - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); struct sock *sk2; - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && @@ -224,10 +221,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, } } - /* Initial allocation may have already happened via setsockopt */ - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return reuseport_alloc(sk); - return 0; + return reuseport_alloc(sk); } /** @@ -338,17 +332,23 @@ found: goto fail_unlock; } - sk_nulls_add_node_rcu(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } + sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -361,8 +361,8 @@ EXPORT_SYMBOL(udp_lib_get_port); * match_wildcard == false: addresses must be exactly the same, i.e. 
* 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -498,34 +498,31 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; -begin: result = NULL; badness = 0; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, NULL, 0); - if (sk2) { - result = sk2; - goto found; - } + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + badness = score; + result = sk; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -533,23 +530,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } @@ -561,14 +541,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -578,7 +556,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -588,33 +566,29 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = 0; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, 
skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -622,25 +596,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, saddr, hnum, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -656,13 +611,24 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, - &udp_table, NULL); + struct sock *sk; + + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +#endif static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, @@ -764,7 +730,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } void udp_err(struct sk_buff *skb, u32 info) @@ -1482,13 +1448,13 @@ void udp_lib_unhash(struct sock *sk) spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (sk_nulls_del_node_init_rcu(sk)) { + if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } @@ -1521,12 +1487,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); @@ -1704,35 +1670,6 @@ drop: return -1; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - unsigned int i; - struct sk_buff *skb1 = NULL; - struct sock *sk; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? 
skb : skb_clone(skb, GFP_ATOMIC); - - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -1756,14 +1693,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct hlist_nulls_node *node; + struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = skb->dev->ifindex; - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + unsigned int offset = offsetof(typeof(*sk), sk_node); + int dif = skb->dev->ifindex; + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1774,23 +1711,28 @@ start_lookup: offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + + if (!first) { + first = sk; + continue; } - } + nskb = skb_clone(skb, GFP_ATOMIC); - spin_unlock(&hslot->lock); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; + } + if (udp_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. 
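After the conversion above, __udp4_lib_lookup() neither enters an RCU read-side section nor takes a reference on the socket it returns; callers are expected to already hold rcu_read_lock() and to take their own reference if the socket must outlive it. A sketch of that caller-side contract, mirroring the udp4_lib_lookup() wrapper in the same hunk (illustrative, not an addition to the patch):

    /* Look up a UDP socket and keep a reference past the RCU section. */
    static struct sock *udp_lookup_and_hold(struct net *net,
                                            __be32 saddr, __be16 sport,
                                            __be32 daddr, __be16 dport,
                                            int dif, struct sk_buff *skb)
    {
            struct sock *sk;

            rcu_read_lock();
            sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
                                   dif, &udp_table, skb);
            /* The socket may already be on its way to being freed. */
            if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
                    sk = NULL;
            rcu_read_unlock();

            return sk;      /* caller drops the reference with sock_put() */
    }

Sockets reached purely within the receive path (such as in __udp4_lib_rcv() above) skip the refcount entirely, which is what allows the sock_put() calls there to be deleted.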
*/ if (use_hash2 && hash2 != hash2_any) { @@ -1798,16 +1740,13 @@ start_lookup: goto start_lookup; } - /* - * do the slow work with no lock held - */ - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udp_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1912,7 +1851,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 @@ -1973,49 +1911,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, int dif) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; - rcu_read_lock(); -begin: - count = 0; result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { + sk_for_each_rcu(sk, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, + rmt_port, rmt_addr, dif, hnum)) { + if (result) + return NULL; result = sk; - ++count; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; } } - rcu_read_unlock(); + return result; } @@ -2028,37 +1941,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, rmt_addr, + loc_addr, ports, dif)) + return sk; /* Only check first socket in chain */ break; } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; + return NULL; } void udp_v4_early_demux(struct sk_buff *skb) @@ -2066,7 +1964,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; - struct sock *sk; + struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; @@ -2098,11 +1996,9 @@ void udp_v4_early_demux(struct sk_buff *skb) } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - } else { - return; } - if (!sk) + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) return; skb->sk = sk; @@ -2395,7 +2291,6 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, @@ -2417,14 +2312,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket <= state->udp_table->mask; ++state->bucket) { - struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -2443,7 +2337,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -2658,12 +2552,12 @@ void __init udp_table_init(struct udp_table *table, const char *name) table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + INIT_HLIST_HEAD(&table->hash[i].head); 
table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6510e961aaa6..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { int err = -EINVAL; - struct sock *sk; + struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, @@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_dport, req->id.idiag_if, tbl, NULL); #endif - else - goto out_nosk; - + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; @@ -97,25 +98,24 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int num, s_num, slot, s_slot; - struct net *net = sock_net(skb->sk); bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { - struct sock *sk; - struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + struct sock *sk; num = 0; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 78766b32b78b..705d9fbf0bcf 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -55,7 +55,6 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e38e058a66c..3cd9ad455910 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -250,7 +250,11 @@ lookup_protocol: * creation time automatically shares. 
*/ inet->inet_sport = htons(inet->inet_num); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); @@ -259,6 +263,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: @@ -277,6 +289,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; + bool saved_ipv6only; int addr_type = 0; int err = 0; @@ -396,19 +409,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; + saved_ipv6only = sk->sk_ipv6only; + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) + sk->sk_ipv6only = 1; + /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - if (addr_type != IPV6_ADDR_ANY) { + if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (addr_type != IPV6_ADDR_MAPPED) - sk->sk_ipv6only = 1; - } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 908525ae5d19..c7ce585ac17e 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -26,6 +26,7 @@ #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> +#include <net/sock_reuseport.h> int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) @@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 21ace5a2bf7c..c27207563d10 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -17,11 +17,13 @@ #include <linux/module.h> #include <linux/random.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { @@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net, const struct hlist_nulls_node *node; struct sock *result; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -146,6 +151,15 @@ begin: if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct 
sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -163,11 +177,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -177,6 +193,7 @@ begin: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) @@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; local_bh_disable(); - sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + ntohs(dport), dif); local_bh_enable(); return sk; @@ -274,3 +292,61 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); + +int inet6_hash(struct sock *sk) +{ + int err = 0; + + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + local_bh_enable(); + } + + return err; +} +EXPORT_SYMBOL_GPL(inet6_hash); + +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e2fe07f7ded9..50091116fb55 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -895,7 +895,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. 
*/ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->saddr, + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) @@ -1420,8 +1421,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - inet6_iif(skb)); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1552,6 +1553,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); @@ -1921,7 +1923,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b345b37edd4f..ce3a100cbf68 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> @@ -78,49 +79,6 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? 
ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} - static u32 udp6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) @@ -252,34 +210,32 @@ static inline int compute_score2(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; -begin: result = NULL; badness = -1; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, NULL, 0); - if (sk2) { - result = sk2; - goto found; - } + + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -287,27 +243,10 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != slot2) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } +/* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, @@ -315,14 +254,12 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -332,7 +269,7 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; @@ -342,32 +279,28 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = -1; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -375,25 +308,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, saddr, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -413,12 +327,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. 
+ */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); + struct sock *sk; + + sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp6_lib_lookup); +#endif /* * This should be easy, if there is something there we @@ -584,7 +510,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), udptable, skb); + inet6_iif(skb), udptable, NULL); if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -615,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -775,33 +701,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - struct sk_buff *skb1 = NULL; - struct sock *sk; - unsigned int i; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - static void udp6_csum_zero_error(struct sk_buff *skb) { /* RFC 2460 section 8.1 says that we SHOULD log @@ -820,15 +719,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct sock *sk, *first = NULL; const struct udphdr *uh = udp_hdr(skb); - struct hlist_nulls_node *node; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = inet6_iif(skb); - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + int dif = inet6_iif(skb); + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & @@ -839,27 +738,32 @@ start_lookup: offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_v6_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum) && - /* If zero checksum and no_check is not on for - * the socket then skip it. 
- */ - (uh->check || udp_sk(sk)->no_check6_rx)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (!uh->check && !udp_sk(sk)->no_check6_rx) + continue; + if (!first) { + first = sk; + continue; + } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; } - } - spin_unlock(&hslot->lock); + if (udpv6_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -867,13 +771,13 @@ start_lookup: goto start_lookup; } - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udpv6_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -881,10 +785,10 @@ start_lookup: int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); - struct sock *sk; struct udphdr *uh; - const struct in6_addr *saddr, *daddr; + struct sock *sk; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -938,7 +842,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (!uh->check && !udp_sk(sk)->no_check6_rx) { - sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; } @@ -948,7 +851,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input */ if (ret > 0) @@ -995,7 +897,6 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif) { struct sock *sk; - struct hlist_nulls_node *hnode; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; @@ -1003,7 +904,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); - udp_portaddr_for_each_entry_rcu(sk, hnode, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) return sk; @@ -1648,7 +1549,6 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index d1eaeeaa34d2..af2895c77ed6 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -50,7 +50,6 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT 
.compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index aed0c7350988..3a31370d568a 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -212,13 +212,14 @@ discard: return 0; } -static void l2tp_ip_hash(struct sock *sk) +static int l2tp_ip_hash(struct sock *sk) { if (sk_unhashed(sk)) { write_lock_bh(&l2tp_ip_lock); sk_add_node(sk, &l2tp_ip_table); write_unlock_bh(&l2tp_ip_lock); } + return 0; } static void l2tp_ip_unhash(struct sock *sk) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index bcb7bce86d98..8cc6a554b6f8 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -225,13 +225,14 @@ discard: return 0; } -static void l2tp_ip6_hash(struct sock *sk) +static int l2tp_ip6_hash(struct sock *sk) { if (sk_unhashed(sk)) { write_lock_bh(&l2tp_ip6_lock); sk_add_node(sk, &l2tp_ip6_table); write_unlock_bh(&l2tp_ip6_lock); } + return 0; } static void l2tp_ip6_unhash(struct sock *sk) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2ed50ba6c1af..d32a3fd4528e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1600,7 +1600,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { int i, bucket, rc; unsigned int hashsize, old_size; diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index e84a578dbe35..d76afafdc699 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -134,7 +134,7 @@ static int __init nf_nat_ftp_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 1fb2258c3535..8039bcd60758 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -107,7 +107,7 @@ static int __init nf_nat_irc_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 3ab591e73ec0..7f4414d26a66 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) * belonging to established connections going through that one. 
*/ static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), saddr, sport, daddr, dport, in->ifindex); @@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, #ifdef XT_TPROXY_HAVE_IPV6 static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), in->ifindex); @@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(par->net, tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on 
the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, - &iph->saddr, laddr, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, + tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 939821821fcb..c5d930670cf0 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb, * box. */ static struct sock * -xt_socket_get_sock_v4(struct net *net, const u8 protocol, +xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, + return __inet_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -148,6 +149,8 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; struct sock *sk = skb->sk; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); @@ -173,6 +176,10 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, sport = hp->source; daddr = iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? + ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, @@ -205,9 +212,9 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, if (sk) atomic_inc(&sk->sk_refcnt); else - sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, - indev); + sk = xt_socket_get_sock_v4(dev_net(skb->dev), data_skb, doff, + protocol, saddr, daddr, sport, + dport, indev); return sk; } @@ -328,14 +335,15 @@ extract_icmp6_fields(const struct sk_buff *skb, } static struct sock * -xt_socket_get_sock_v6(struct net *net, const u8 protocol, +xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, + return inet6_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -354,6 +362,8 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); @@ -375,6 +385,10 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, sport = hp->source; daddr = &iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = tproto == IPPROTO_TCP ? 
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) : + thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { struct ipv6hdr ipv6_var; @@ -389,8 +403,8 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, if (sk) atomic_inc(&sk->sk_refcnt); else - sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, - saddr, daddr, sport, dport, + sk = xt_socket_get_sock_v6(dev_net(skb->dev), data_skb, doff, + tproto, saddr, daddr, sport, dport, indev); return sk; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d575ef4e9aa6..ffd5f2297584 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_unlock(); } -void pn_sock_hash(struct sock *sk) +int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); + + return 0; } EXPORT_SYMBOL(pn_sock_hash); @@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) pn->resource = spn->spn_resource; /* Enable RX on the socket */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fcac88f1774b..4d60f1e42f42 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6171,9 +6171,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } -static void sctp_hash(struct sock *sk) +static int sctp_hash(struct sock *sk) { /* STUB */ + return 0; } static void sctp_unhash(struct sock *sk) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41f6e964fe91..1a7d930a867c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int -param_set_pool_mode(const char *val, struct kernel_param *kp) +param_set_pool_mode(const char *val, const struct kernel_param *kp) { int *ip = (int *)kp->arg; struct svc_pool_map *m = &svc_pool_map; @@ -80,7 +80,7 @@ out: } static int -param_get_pool_mode(char *buf, struct kernel_param *kp) +param_get_pool_mode(char *buf, const struct kernel_param *kp) { int *ip = (int *)kp->arg; diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 2e69bf6da207..80cfd227c984 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -37,7 +37,9 @@ static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = (void *) BPF_FUNC_clone_redirect; static int (*bpf_redirect)(int ifindex, int flags) = (void *) BPF_FUNC_redirect; -static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) = +static int (*bpf_perf_event_output)(void *ctx, void *map, + unsigned long long flags, void *data, + int size) = (void *) BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = (void *) BPF_FUNC_get_stackid; diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 1112dd94b440..8a491e6b542c 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -148,6 +148,37 @@ cc-disable-warning = $(call try-run,\ # Expands to either gcc or clang cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc) +# __cc-version +# Returns compiler version +__cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/$(cc-name)-version.sh $(CC)) + +# __cc-fullversion +# Returns full compiler version +__cc-fullversion = 
$(shell $(CONFIG_SHELL) \
+	$(srctree)/scripts/$(cc-name)-version.sh -p $(CC))
+
+# __cc-ifversion
+# Matches compiler name and version
+# Usage: EXTRA_CFLAGS += $(call __cc-ifversion, gcc, -lt, 0402, -O1)
+__cc-ifversion = $(shell [ $(cc-name) = $(1) ] && [ $(__cc-version) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# __cc-if-fullversion
+# Matches compiler name and full version
+# Usage: EXTRA_CFLAGS += $(call __cc-if-fullversion, gcc, -lt, 040502, -O1)
+__cc-if-fullversion = $(shell [ $(cc-name) = $(1) ] && [ $(__cc-fullversion) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# gcc-ifversion
+gcc-ifversion = $(call __cc-ifversion, gcc, $(1), $(2), $(3), $(4))
+
+# gcc-if-fullversion
+gcc-if-fullversion = $(call __cc-if-fullversion, gcc, $(1), $(2), $(3), $(4))
+
+# clang-ifversion
+clang-ifversion = $(call __cc-ifversion, clang, $(1), $(2), $(3), $(4))
+
+# clang-if-fullversion
+clang-if-fullversion = $(call __cc-if-fullversion, clang, $(1), $(2), $(3), $(4))
+
 # cc-version
 cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
@@ -155,9 +186,9 @@ cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 cc-fullversion = $(shell $(CONFIG_SHELL) \
 	$(srctree)/scripts/gcc-version.sh -p $(CC))
 
-# cc-ifversion
-# Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1)
-cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3) || echo $(4))
+# backward compatibility
+cc-ifversion = $(gcc-ifversion)
+cc-if-fullversion = $(gcc-if-fullversion)
 
 # cc-ldoption
 # Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both)
@@ -165,14 +196,18 @@ cc-ldoption = $(call try-run,\
 	$(CC) $(1) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
 
 # ld-option
-# Usage: LDFLAGS += $(call ld-option, -X)
-ld-option = $(call try-run, $(LD) $(LDFLAGS) $(1) -v,$(1),$(2))
+# Usage: LDFLAGS += $(call ld-option, -X, -Y)
+ld-option = $(call try-run, $(LD) $(LDFLAGS) $(1) -v,$(1),$(2),$(3))
 
 # ar-option
 # Usage: KBUILD_ARFLAGS := $(call ar-option,D)
 # Important: no spaces around options
 ar-option = $(call try-run, $(AR) rc$(1) "$$TMP",$(1),$(2))
 
+# ld-name
+# Expands to either bfd or gold
+ld-name = $(shell $(LD) -v 2>&1 | grep -q "GNU gold" && echo gold || echo bfd)
+
 # ld-version
 # Note this is mainly for HJ Lu's 3 number binutil versions
 ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh)
@@ -181,6 +216,18 @@ ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh)
 # ld-ifversion
 # Usage: $(call ld-ifversion, -ge, 22252, y)
 ld-ifversion = $(shell [ $(ld-version) $(1) $(2) ] && echo $(3) || echo $(4))
 
+# __ld-ifversion
+# Usage: $(call __ld-ifversion, gold, -ge, 112000000, y)
+__ld-ifversion = $(shell [ $(ld-name) = $(1) ] && [ $(ld-version) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# bfd-ifversion
+# Usage: $(call bfd-ifversion, -ge, 227000000, y)
+bfd-ifversion = $(call __ld-ifversion, bfd, $(1), $(2), $(3), $(4))
+
+# gold-ifversion
+# Usage: $(call gold-ifversion, -ge, 112000000, y)
+gold-ifversion = $(call __ld-ifversion, gold, $(1), $(2), $(3), $(4))
+
 ######
 ###
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 726d1b3f6759..23d49459ee66 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -280,6 +280,13 @@ define rule_as_o_S
 	mv -f $(dot-target).tmp $(dot-target).cmd
 endef
 
+# List module undefined symbols (or empty line if not enabled)
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+cmd_undef_syms = $(NM) $@ | sed -n 's/^ \+U //p' | xargs echo
+else
+cmd_undef_syms = echo
+endif
+
 # 
Built-in and composite module parts $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) @@ -290,7 +297,8 @@ $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) - @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod) + @{ echo $(@:.o=.ko); echo $@; \ + $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod) quiet_cmd_cc_lst_c = MKLST $@ cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \ @@ -415,12 +423,22 @@ $(sort $(subdir-obj-y)): $(subdir-ym) ; # Rule to compile a set of .o files into one .o file # ifdef builtin-target -quiet_cmd_link_o_target = LD $@ + +ifdef CONFIG_THIN_ARCHIVES + cmd_make_builtin = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) + cmd_make_empty_builtin = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) + quiet_cmd_link_o_target = AR $@ +else + cmd_make_builtin = $(LD) $(ld_flags) -r -o + cmd_make_empty_builtin = rm -f $@; $(AR) rcs$(KBUILD_ARFLAGS) + quiet_cmd_link_o_target = LD $@ +endif + # If the list of objects to link is empty, just create an empty built-in.o cmd_link_o_target = $(if $(strip $(obj-y)),\ - $(LD) $(ld_flags) -r -o $@ $(filter $(obj-y), $^) \ + $(cmd_make_builtin) $@ $(filter $(obj-y), $^) \ $(cmd_secanalysis),\ - rm -f $@; $(AR) rcs$(KBUILD_ARFLAGS) $@) + $(cmd_make_empty_builtin) $@) $(builtin-target): $(obj-y) FORCE $(call if_changed,link_o_target) @@ -446,7 +464,12 @@ $(modorder-target): $(subdir-ym) FORCE # ifdef lib-target quiet_cmd_link_l_target = AR $@ -cmd_link_l_target = rm -f $@; $(AR) rcs$(KBUILD_ARFLAGS) $@ $(lib-y) + +ifdef CONFIG_THIN_ARCHIVES + cmd_link_l_target = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(lib-y) +else + cmd_link_l_target = rm -f $@; $(AR) rcs$(KBUILD_ARFLAGS) $@ $(lib-y) +endif $(lib-target): $(lib-y) FORCE $(call if_changed,link_l_target) @@ -461,16 +484,27 @@ endif # <composite-object>-objs := <list of .o files> # or # <composite-object>-y := <list of .o files> +# or +# <composite-object>-m := <list of .o files> +# The -m syntax only works if <composite object> is a module link_multi_deps = \ $(filter $(addprefix $(obj)/, \ $($(subst $(obj)/,,$(@:.o=-objs))) \ -$($(subst $(obj)/,,$(@:.o=-y)))), $^) +$($(subst $(obj)/,,$(@:.o=-y))) \ +$($(subst $(obj)/,,$(@:.o=-m)))), $^) -quiet_cmd_link_multi-y = LD $@ -cmd_link_multi-y = $(LD) $(ld_flags) -r -o $@ $(link_multi_deps) $(cmd_secanalysis) +cmd_link_multi-link = $(LD) $(ld_flags) -r -o $@ $(link_multi_deps) $(cmd_secanalysis) + +ifdef CONFIG_THIN_ARCHIVES + quiet_cmd_link_multi-y = AR $@ + cmd_link_multi-y = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) $@ $(link_multi_deps) +else + quiet_cmd_link_multi-y = LD $@ + cmd_link_multi-y = $(cmd_link_multi-link) +endif quiet_cmd_link_multi-m = LD [M] $@ -cmd_link_multi-m = $(cmd_link_multi-y) +cmd_link_multi-m = $(cmd_link_multi-link) $(multi-used-y): FORCE $(call if_changed,link_multi-y) @@ -478,8 +512,9 @@ $(call multi_depend, $(multi-used-y), .o, -objs -y) $(multi-used-m): FORCE $(call if_changed,link_multi-m) - @{ echo $(@:.o=.ko); echo $(link_multi_deps); } > $(MODVERDIR)/$(@F:.o=.mod) -$(call multi_depend, $(multi-used-m), .o, -objs -y) + @{ echo $(@:.o=.ko); echo $(link_multi_deps); \ + $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod) +$(call multi_depend, $(multi-used-m), .o, -objs -y -m) targets += $(multi-used-y) $(multi-used-m) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 133edfae5b8a..a89a0c5d43b2 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host 
@@ -85,7 +85,7 @@ hostcxx_flags = -Wp,-MD,$(depfile) $(__hostcxx_flags) # Create executable from a single .c file # host-csingle -> Executable quiet_cmd_host-csingle = HOSTCC $@ - cmd_host-csingle = $(HOSTCC) $(hostc_flags) -o $@ $< \ + cmd_host-csingle = $(HOSTCC) $(hostc_flags) $(HOSTLDFLAGS) -o $@ $< \ $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F)) $(host-csingle): $(obj)/%: $(src)/%.c FORCE $(call if_changed_dep,host-csingle) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 5a93712ea570..2daf8d6e49a1 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -48,7 +48,7 @@ subdir-ym := $(sort $(subdir-y) $(subdir-m)) # if $(foo-objs) exists, foo.o is a composite object multi-used-y := $(sort $(foreach m,$(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y))), $(m)))) -multi-used-m := $(sort $(foreach m,$(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y))), $(m)))) +multi-used-m := $(sort $(foreach m,$(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m))), $(m)))) multi-used := $(multi-used-y) $(multi-used-m) single-used-m := $(sort $(filter-out $(multi-used-m),$(obj-m))) @@ -67,7 +67,7 @@ obj-dirs := $(dir $(multi-objs) $(obj-y)) # Replace multi-part objects by their individual parts, look at local dir only real-objs-y := $(foreach m, $(filter-out $(subdir-obj-y), $(obj-y)), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y))),$($(m:.o=-objs)) $($(m:.o=-y)),$(m))) $(extra-y) -real-objs-m := $(foreach m, $(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y))),$($(m:.o=-objs)) $($(m:.o=-y)),$(m))) +real-objs-m := $(foreach m, $(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m))),$($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)),$(m))) # Add subdir path diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 7718a64b1cd1..8cb7971b3f25 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -115,14 +115,18 @@ $(modules:.ko=.mod.o): %.mod.o: %.mod.c FORCE targets += $(modules:.ko=.mod.o) -# Step 6), final link of the modules +ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink) + +# Step 6), final link of the modules with optional arch pass after final link quiet_cmd_ld_ko_o = LD [M] $@ - cmd_ld_ko_o = $(LD) -r $(LDFLAGS) \ - $(KBUILD_LDFLAGS_MODULE) $(LDFLAGS_MODULE) \ - -o $@ $(filter-out FORCE,$^) + cmd_ld_ko_o = \ + $(LD) -r $(LDFLAGS) \ + $(KBUILD_LDFLAGS_MODULE) $(LDFLAGS_MODULE) \ + -o $@ $(filter-out FORCE,$^) ; \ + $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) $(modules): %.ko :%.o %.mod.o FORCE - $(call if_changed,ld_ko_o) + +$(call if_changed,ld_ko_o) targets += $(modules) diff --git a/scripts/clang-version.sh b/scripts/clang-version.sh new file mode 100755 index 000000000000..9780efa56980 --- /dev/null +++ b/scripts/clang-version.sh @@ -0,0 +1,33 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# clang-version [-p] clang-command +# +# Prints the compiler version of `clang-command' in a canonical 4-digit form +# such as `0500' for clang-5.0 etc. +# +# With the -p option, prints the patchlevel as well, for example `050001' for +# clang-5.0.1 etc. +# + +if [ "$1" = "-p" ] ; then + with_patchlevel=1; + shift; +fi + +compiler="$*" + +if [ ${#compiler} -eq 0 ]; then + echo "Error: No compiler specified." 
+ printf "Usage:\n\t$0 <clang-command>\n" + exit 1 +fi + +MAJOR=$(echo __clang_major__ | $compiler -E -x c - | tail -n 1) +MINOR=$(echo __clang_minor__ | $compiler -E -x c - | tail -n 1) +if [ "x$with_patchlevel" != "x" ] ; then + PATCHLEVEL=$(echo __clang_patchlevel__ | $compiler -E -x c - | tail -n 1) + printf "%02d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL +else + printf "%02d%02d\\n" $MAJOR $MINOR +fi diff --git a/scripts/extract-cert.c b/scripts/extract-cert.c index b071bf476fea..dd1a4bd706a2 100644 --- a/scripts/extract-cert.c +++ b/scripts/extract-cert.c @@ -23,6 +23,13 @@ #include <openssl/err.h> #include <openssl/engine.h> +/* + * OpenSSL 3.0 deprecates the OpenSSL's ENGINE API. + * + * Remove this if/when that API is no longer used + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #define PKEY_ID_PKCS7 2 static __attribute__((noreturn)) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index ba6c34ea5429..0a08006d600a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -3,9 +3,12 @@ # link vmlinux # # vmlinux is linked from the objects selected by $(KBUILD_VMLINUX_INIT) and -# $(KBUILD_VMLINUX_MAIN). Most are built-in.o files from top-level directories -# in the kernel tree, others are specified in arch/$(ARCH)/Makefile. -# Ordering when linking is important, and $(KBUILD_VMLINUX_INIT) must be first. +# $(KBUILD_VMLINUX_MAIN) and $(KBUILD_VMLINUX_LIBS). Most are built-in.o files +# from top-level directories in the kernel tree, others are specified in +# arch/$(ARCH)/Makefile. Ordering when linking is important, and +# $(KBUILD_VMLINUX_INIT) must be first. $(KBUILD_VMLINUX_LIBS) are archives +# which are linked conditionally (not within --whole-archive), and do not +# require symbol indexes added. # # vmlinux # ^ @@ -16,6 +19,9 @@ # +--< $(KBUILD_VMLINUX_MAIN) # | +--< drivers/built-in.o mm/built-in.o + more # | +# +--< $(KBUILD_VMLINUX_LIBS) +# | +--< lib/lib.a + more +# | # +-< ${kallsymso} (see description in KALLSYMS section) # # vmlinux version (uname -v) cannot be updated during normal @@ -37,12 +43,47 @@ info() fi } +# Thin archive build here makes a final archive with symbol table and indexes +# from vmlinux objects INIT and MAIN, which can be used as input to linker. +# KBUILD_VMLINUX_LIBS archives should already have symbol table and indexes +# added. 
+# +# Traditional incremental style of link does not require this step +# +# built-in.o output file +# +archive_builtin() +{ + if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then + info AR built-in.o + rm -f built-in.o; + ${AR} rcsTP${KBUILD_ARFLAGS} built-in.o \ + ${KBUILD_VMLINUX_INIT} \ + ${KBUILD_VMLINUX_MAIN} + fi +} + # Link of vmlinux.o used for section mismatch analysis # ${1} output file modpost_link() { - ${LD} ${LDFLAGS} -r -o ${1} ${KBUILD_VMLINUX_INIT} \ - --start-group ${KBUILD_VMLINUX_MAIN} --end-group + local objects + + if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then + objects="--whole-archive \ + built-in.o \ + --no-whole-archive \ + --start-group \ + ${KBUILD_VMLINUX_LIBS} \ + --end-group" + else + objects="${KBUILD_VMLINUX_INIT} \ + --start-group \ + ${KBUILD_VMLINUX_MAIN} \ + ${KBUILD_VMLINUX_LIBS} \ + --end-group" + fi + ${LD} ${LDFLAGS} -r -o ${1} ${objects} } # Link of vmlinux @@ -51,18 +92,50 @@ modpost_link() vmlinux_link() { local lds="${objtree}/${KBUILD_LDS}" + local objects if [ "${SRCARCH}" != "um" ]; then - ${LD} ${LDFLAGS} ${LDFLAGS_vmlinux} -o ${2} \ - -T ${lds} ${KBUILD_VMLINUX_INIT} \ - --start-group ${KBUILD_VMLINUX_MAIN} --end-group ${1} + if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then + objects="--whole-archive \ + built-in.o \ + --no-whole-archive \ + --start-group \ + ${KBUILD_VMLINUX_LIBS} \ + --end-group \ + ${1}" + else + objects="${KBUILD_VMLINUX_INIT} \ + --start-group \ + ${KBUILD_VMLINUX_MAIN} \ + ${KBUILD_VMLINUX_LIBS} \ + --end-group \ + ${1}" + fi + + ${LD} ${LDFLAGS} ${LDFLAGS_vmlinux} -o ${2} \ + -T ${lds} ${objects} else - ${CC} ${CFLAGS_vmlinux} -o ${2} \ - -Wl,-T,${lds} ${KBUILD_VMLINUX_INIT} \ - -Wl,--start-group \ - ${KBUILD_VMLINUX_MAIN} \ - -Wl,--end-group \ - -lutil -lrt -lpthread ${1} + if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then + objects="-Wl,--whole-archive \ + built-in.o \ + -Wl,--no-whole-archive \ + -Wl,--start-group \ + ${KBUILD_VMLINUX_LIBS} \ + -Wl,--end-group \ + ${1}" + else + objects="${KBUILD_VMLINUX_INIT} \ + -Wl,--start-group \ + ${KBUILD_VMLINUX_MAIN} \ + ${KBUILD_VMLINUX_LIBS} \ + -Wl,--end-group \ + ${1}" + fi + + ${CC} ${CFLAGS_vmlinux} -o ${2} \ + -Wl,-T,${lds} \ + ${objects} \ + -lutil -lrt -lpthread rm -f linux fi } @@ -118,6 +191,7 @@ cleanup() rm -f .tmp_kallsyms* rm -f .tmp_version rm -f .tmp_vmlinux* + rm -f built-in.o rm -f System.map rm -f vmlinux rm -f vmlinux.o @@ -161,13 +235,6 @@ case "${KCONFIG_CONFIG}" in . "./${KCONFIG_CONFIG}" esac -#link vmlinux.o -info LD vmlinux.o -modpost_link vmlinux.o - -# modpost vmlinux.o to check for section mismatches -${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o - # Update version info GEN .version if [ ! -r .version ]; then @@ -181,6 +248,15 @@ fi; # final build of init/ ${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init +archive_builtin + +#link vmlinux.o +info LD vmlinux.o +modpost_link vmlinux.o + +# modpost vmlinux.o to check for section mismatches +${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o + kallsymso="" kallsyms_vmlinux="" if [ -n "${CONFIG_KALLSYMS}" ]; then diff --git a/scripts/sign-file.c b/scripts/sign-file.c index 250a7a645033..b57ade78b24a 100755 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -28,6 +28,13 @@ #include <openssl/engine.h> /* + * OpenSSL 3.0 deprecates the OpenSSL's ENGINE API. 
+ * + * Remove this if/when that API is no longer used + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +/* * Use CMS if we have openssl-1.0.0 or newer available - otherwise we have to * assume that it's not available and its header file is missing and that we * should use PKCS#7 instead. Switching to the older PKCS#7 format restricts diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index a95b6c98d460..6087f210c93a 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -677,11 +677,11 @@ static const struct kernel_param_ops param_ops_aalockpolicy = { .get = param_get_aalockpolicy }; -static int param_set_audit(const char *val, struct kernel_param *kp); -static int param_get_audit(char *buffer, struct kernel_param *kp); +static int param_set_audit(const char *val, const struct kernel_param *kp); +static int param_get_audit(char *buffer, const struct kernel_param *kp); -static int param_set_mode(const char *val, struct kernel_param *kp); -static int param_get_mode(char *buffer, struct kernel_param *kp); +static int param_set_mode(const char *val, const struct kernel_param *kp); +static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. @@ -789,7 +789,7 @@ static int param_get_aauint(char *buffer, const struct kernel_param *kp) return param_get_uint(buffer, kp); } -static int param_get_audit(char *buffer, struct kernel_param *kp) +static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!policy_view_capable()) return -EPERM; @@ -800,7 +800,7 @@ static int param_get_audit(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } -static int param_set_audit(const char *val, struct kernel_param *kp) +static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!policy_admin_capable()) @@ -822,7 +822,7 @@ static int param_set_audit(const char *val, struct kernel_param *kp) return -EINVAL; } -static int param_get_mode(char *buffer, struct kernel_param *kp) +static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!policy_admin_capable()) return -EPERM; @@ -833,7 +833,7 @@ static int param_get_mode(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } -static int param_set_mode(const char *val, struct kernel_param *kp) +static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!policy_admin_capable()) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b967dcadcb4d..3d24f86b423f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2774,7 +2774,7 @@ static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data) return rc; /* Allow all mounts performed by the kernel */ - if (flags & MS_KERNMOUNT) + if (flags & (MS_KERNMOUNT | MS_SUBMOUNT)) return 0; ad.type = LSM_AUDIT_DATA_DENTRY; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index e27b71a5d8d5..d1ee3dfb2555 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -105,7 +105,8 @@ struct security_class_mapping secclass_map[] = { { COMMON_IPC_PERMS, NULL } }, { "netlink_route_socket", { COMMON_SOCK_PERMS, - "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", NULL } }, + "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", "nlmsg_getneigh", + 
NULL } }, { "netlink_tcpdiag_socket", { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } }, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index b45a3a72c161..efb852e076b6 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -79,6 +79,7 @@ enum { #define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) extern int selinux_android_netlink_route; +extern int selinux_android_netlink_getneigh; extern int selinux_policycap_netpeer; extern int selinux_policycap_openperm; extern int selinux_policycap_alwaysnetwork; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 78a8c420b1f5..c321361af6bf 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -192,12 +192,12 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) return err; } -static void nlmsg_set_getlink_perm(u32 perm) +static void nlmsg_set_perm_for_type(u32 perm, u16 type) { int i; for (i = 0; i < ARRAY_SIZE(nlmsg_route_perms); i++) { - if (nlmsg_route_perms[i].nlmsg_type == RTM_GETLINK) { + if (nlmsg_route_perms[i].nlmsg_type == type) { nlmsg_route_perms[i].perm = perm; break; } @@ -207,11 +207,27 @@ static void nlmsg_set_getlink_perm(u32 perm) /** * Use nlmsg_readpriv as the permission for RTM_GETLINK messages if the * netlink_route_getlink policy capability is set. Otherwise use nlmsg_read. + * Similarly, use nlmsg_getneigh for RTM_GETNEIGH and RTM_GETNEIGHTBL if the + * netlink_route_getneigh policy capability is set. Otherwise use nlmsg_read. */ void selinux_nlmsg_init(void) { if (selinux_android_netlink_route) - nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV, + RTM_GETLINK); else - nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READ); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETLINK); + + if (selinux_android_netlink_getneigh) { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGHTBL); + } else { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGHTBL); + } } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 5ee23e3a3678..c5b2940f4dab 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -2333,6 +2333,10 @@ int policydb_read(struct policydb *p, void *fp) p->android_netlink_route = 1; } + if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH)) { + p->android_netlink_getneigh = 1; + } + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 0d511cf3c1e9..45698f754766 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -228,6 +228,7 @@ struct genfs { struct policydb { int mls_enabled; int android_netlink_route; + int android_netlink_getneigh; /* symbol tables */ struct symtab symtab[SYM_NUM]; @@ -315,6 +316,7 @@ extern int policydb_write(struct policydb *p, void *fp); #define POLICYDB_CONFIG_MLS 1 #define POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE (1 << 31) +#define POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH (1 << 30) /* the config flags related to unknown classes/perms are bits 2 and 3 */ #define REJECT_UNKNOWN 0x00000002 diff --git 
a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 52cf500a8418..b615627d1abc 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -71,6 +71,7 @@ #include "audit.h" int selinux_android_netlink_route; +int selinux_android_netlink_getneigh; int selinux_policycap_netpeer; int selinux_policycap_openperm; int selinux_policycap_alwaysnetwork; @@ -2000,6 +2001,7 @@ static void security_load_policycaps(void) POLICYDB_CAPABILITY_ALWAYSNETWORK); selinux_android_netlink_route = policydb.android_netlink_route; + selinux_android_netlink_getneigh = policydb.android_netlink_getneigh; selinux_nlmsg_init(); } diff --git a/sound/core/control.c b/sound/core/control.c index 43c8eac250b8..04b939df3768 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -877,24 +877,18 @@ static int snd_ctl_elem_read(struct snd_card *card, struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; - int result; - down_read(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &control->id); - if (kctl == NULL) { - result = -ENOENT; - } else { - index_offset = snd_ctl_get_ioff(kctl, &control->id); - vd = &kctl->vd[index_offset]; - if ((vd->access & SNDRV_CTL_ELEM_ACCESS_READ) && - kctl->get != NULL) { - snd_ctl_build_ioff(&control->id, kctl, index_offset); - result = kctl->get(kctl, control); - } else - result = -EPERM; - } - up_read(&card->controls_rwsem); - return result; + if (kctl == NULL) + return -ENOENT; + + index_offset = snd_ctl_get_ioff(kctl, &control->id); + vd = &kctl->vd[index_offset]; + if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_READ) || kctl->get == NULL) + return -EPERM; + + snd_ctl_build_ioff(&control->id, kctl, index_offset); + return kctl->get(kctl, control); } static int snd_ctl_elem_read_user(struct snd_card *card, @@ -909,8 +903,11 @@ static int snd_ctl_elem_read_user(struct snd_card *card, snd_power_lock(card); result = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (result >= 0) + if (result >= 0) { + down_read(&card->controls_rwsem); result = snd_ctl_elem_read(card, control); + up_read(&card->controls_rwsem); + } snd_power_unlock(card); if (result >= 0) if (copy_to_user(_control, control, sizeof(*control))) @@ -927,30 +924,28 @@ static int snd_ctl_elem_write(struct snd_card *card, struct snd_ctl_file *file, unsigned int index_offset; int result; - down_read(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &control->id); - if (kctl == NULL) { - result = -ENOENT; - } else { - index_offset = snd_ctl_get_ioff(kctl, &control->id); - vd = &kctl->vd[index_offset]; - if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_WRITE) || - kctl->put == NULL || - (file && vd->owner && vd->owner != file)) { - result = -EPERM; - } else { - snd_ctl_build_ioff(&control->id, kctl, index_offset); - result = kctl->put(kctl, control); - } - if (result > 0) { - struct snd_ctl_elem_id id = control->id; - up_read(&card->controls_rwsem); - snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &id); - return 0; - } + if (kctl == NULL) + return -ENOENT; + + index_offset = snd_ctl_get_ioff(kctl, &control->id); + vd = &kctl->vd[index_offset]; + if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_WRITE) || kctl->put == NULL || + (file && vd->owner && vd->owner != file)) { + return -EPERM; } - up_read(&card->controls_rwsem); - return result; + + snd_ctl_build_ioff(&control->id, kctl, index_offset); + result = kctl->put(kctl, control); + if (result < 0) + return result; + + if (result > 0) { + struct snd_ctl_elem_id id = control->id; + snd_ctl_notify(card, 
SNDRV_CTL_EVENT_MASK_VALUE, &id); + } + + return 0; } static int snd_ctl_elem_write_user(struct snd_ctl_file *file, @@ -967,8 +962,11 @@ static int snd_ctl_elem_write_user(struct snd_ctl_file *file, card = file->card; snd_power_lock(card); result = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (result >= 0) + if (result >= 0) { + down_write(&card->controls_rwsem); result = snd_ctl_elem_write(card, file, control); + up_write(&card->controls_rwsem); + } snd_power_unlock(card); if (result >= 0) if (copy_to_user(_control, control, sizeof(*control))) diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c index 3fd7f67e701e..f1df25922bd7 100644 --- a/sound/core/control_compat.c +++ b/sound/core/control_compat.c @@ -320,8 +320,11 @@ static int ctl_elem_read_user(struct snd_card *card, snd_power_lock(card); err = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (err >= 0) + if (err >= 0) { + down_read(&card->controls_rwsem); err = snd_ctl_elem_read(card, data); + up_read(&card->controls_rwsem); + } snd_power_unlock(card); if (err >= 0) err = copy_ctl_value_to_user(userdata, valuep, data, @@ -349,8 +352,11 @@ static int ctl_elem_write_user(struct snd_ctl_file *file, snd_power_lock(card); err = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (err >= 0) + if (err >= 0) { + down_write(&card->controls_rwsem); err = snd_ctl_elem_write(card, file, data); + up_write(&card->controls_rwsem); + } snd_power_unlock(card); if (err >= 0) err = copy_ctl_value_to_user(userdata, valuep, data, diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 00326629d4af..69bb3fc38fb2 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,3 +1,5 @@ socket psock_fanout psock_tpacket +reuseport_bpf +reuseport_bpf_cpu diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index fac4782c51d8..c658792d47b4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -4,7 +4,7 @@ CFLAGS = -Wall -O2 -g CFLAGS += -I../../../../usr/include/ -NET_PROGS = socket psock_fanout psock_tpacket +NET_PROGS = socket psock_fanout psock_tpacket reuseport_bpf reuseport_bpf_cpu all: $(NET_PROGS) %: %.c diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c new file mode 100644 index 000000000000..b5277106df1f --- /dev/null +++ b/tools/testing/selftests/net/reuseport_bpf.c @@ -0,0 +1,641 @@ +/* + * Test functionality of BPF filters for SO_REUSEPORT. The tests below will use + * a BPF program (both classic and extended) to read the first word from an + * incoming packet (expected to be in network byte-order), calculate a modulus + * of that number, and then dispatch the packet to the Nth socket using the + * result. These tests are run for each supported address family and protocol. + * Additionally, a few edge cases in the implementation are tested. 
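+ *
+ * As a concrete illustration: with a group of 10 receiving sockets and a
+ * filter that returns (first payload word) % 10, a packet whose first word
+ * decodes to 7 is expected to be delivered to the socket at index 7.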
+ */ + +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/resource.h> +#include <unistd.h> + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +struct test_params { + int recv_family; + int send_family; + int protocol; + size_t recv_socks; + uint16_t recv_port; + uint16_t send_port_min; +}; + +static size_t sockaddr_size(void) +{ + return sizeof(struct sockaddr_storage); +} + +static struct sockaddr *new_any_sockaddr(int family, uint16_t port) +{ + struct sockaddr_storage *addr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + addr = malloc(sizeof(struct sockaddr_storage)); + memset(addr, 0, sizeof(struct sockaddr_storage)); + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + addr4->sin_family = AF_INET; + addr4->sin_addr.s_addr = htonl(INADDR_ANY); + addr4->sin_port = htons(port); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + addr6->sin6_family = AF_INET6; + addr6->sin6_addr = in6addr_any; + addr6->sin6_port = htons(port); + break; + default: + error(1, 0, "Unsupported family %d", family); + } + return (struct sockaddr *)addr; +} + +static struct sockaddr *new_loopback_sockaddr(int family, uint16_t port) +{ + struct sockaddr *addr = new_any_sockaddr(family, port); + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + addr6->sin6_addr = in6addr_loopback; + break; + default: + error(1, 0, "Unsupported family %d", family); + } + return addr; +} + +static void attach_ebpf(int fd, uint16_t mod) +{ + static char bpf_log_buf[65536]; + static const char bpf_license[] = "GPL"; + + int bpf_fd; + const struct bpf_insn prog[] = { + /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */ + { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 }, + /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */ + { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 }, + /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */ + { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod }, + /* BPF_EXIT_INSN() */ + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insn_cnt = ARRAY_SIZE(prog); + attr.insns = (unsigned long) &prog; + attr.license = (unsigned long) &bpf_license; + attr.log_buf = (unsigned long) &bpf_log_buf; + attr.log_size = sizeof(bpf_log_buf); + attr.log_level = 1; + attr.kern_version = 0; + + bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (bpf_fd < 0) + error(1, errno, "ebpf error. 
log:\n%s\n", bpf_log_buf); + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd, + sizeof(bpf_fd))) + error(1, errno, "failed to set SO_ATTACH_REUSEPORT_EBPF"); + + close(bpf_fd); +} + +static void attach_cbpf(int fd, uint16_t mod) +{ + struct sock_filter code[] = { + /* A = (uint32_t)skb[0] */ + { BPF_LD | BPF_W | BPF_ABS, 0, 0, 0 }, + /* A = A % mod */ + { BPF_ALU | BPF_MOD, 0, 0, mod }, + /* return A */ + { BPF_RET | BPF_A, 0, 0, 0 }, + }; + struct sock_fprog p = { + .len = ARRAY_SIZE(code), + .filter = code, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) + error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF"); +} + +static void build_recv_group(const struct test_params p, int fd[], uint16_t mod, + void (*attach_bpf)(int, uint16_t)) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + int i, opt; + + for (i = 0; i < p.recv_socks; ++i) { + fd[i] = socket(p.recv_family, p.protocol, 0); + if (fd[i] < 0) + error(1, errno, "failed to create recv %d", i); + + opt = 1; + if (setsockopt(fd[i], SOL_SOCKET, SO_REUSEPORT, &opt, + sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on %d", i); + + if (i == 0) + attach_bpf(fd[i], mod); + + if (bind(fd[i], addr, sockaddr_size())) + error(1, errno, "failed to bind recv socket %d", i); + + if (p.protocol == SOCK_STREAM) { + opt = 4; + if (setsockopt(fd[i], SOL_TCP, TCP_FASTOPEN, &opt, + sizeof(opt))) + error(1, errno, + "failed to set TCP_FASTOPEN on %d", i); + if (listen(fd[i], p.recv_socks * 10)) + error(1, errno, "failed to listen on socket"); + } + } + free(addr); +} + +static void send_from(struct test_params p, uint16_t sport, char *buf, + size_t len) +{ + struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport); + struct sockaddr * const daddr = + new_loopback_sockaddr(p.send_family, p.recv_port); + const int fd = socket(p.send_family, p.protocol, 0), one = 1; + + if (fd < 0) + error(1, errno, "failed to create send socket"); + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) + error(1, errno, "failed to set reuseaddr"); + + if (bind(fd, saddr, sockaddr_size())) + error(1, errno, "failed to bind send socket"); + + if (sendto(fd, buf, len, MSG_FASTOPEN, daddr, sockaddr_size()) < 0) + error(1, errno, "failed to send message"); + + close(fd); + free(saddr); + free(daddr); +} + +static void test_recv_order(const struct test_params p, int fd[], int mod) +{ + char recv_buf[8], send_buf[8]; + struct msghdr msg; + struct iovec recv_io = { recv_buf, 8 }; + struct epoll_event ev; + int epfd, conn, i, sport, expected; + uint32_t data, ndata; + + epfd = epoll_create(1); + if (epfd < 0) + error(1, errno, "failed to create epoll"); + for (i = 0; i < p.recv_socks; ++i) { + ev.events = EPOLLIN; + ev.data.fd = fd[i]; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd[i], &ev)) + error(1, errno, "failed to register sock %d epoll", i); + } + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &recv_io; + msg.msg_iovlen = 1; + + for (data = 0; data < p.recv_socks * 2; ++data) { + sport = p.send_port_min + data; + ndata = htonl(data); + memcpy(send_buf, &ndata, sizeof(ndata)); + send_from(p, sport, send_buf, sizeof(ndata)); + + i = epoll_wait(epfd, &ev, 1, -1); + if (i < 0) + error(1, errno, "epoll wait failed"); + + if (p.protocol == SOCK_STREAM) { + conn = accept(ev.data.fd, NULL, NULL); + if (conn < 0) + error(1, errno, "error accepting"); + i = recvmsg(conn, &msg, 0); + close(conn); + } else { + i = recvmsg(ev.data.fd, &msg, 0); + } + if (i < 0) + 
error(1, errno, "recvmsg error"); + if (i != sizeof(ndata)) + error(1, 0, "expected size %zd got %d", + sizeof(ndata), i); + + for (i = 0; i < p.recv_socks; ++i) + if (ev.data.fd == fd[i]) + break; + memcpy(&ndata, recv_buf, sizeof(ndata)); + fprintf(stderr, "Socket %d: %d\n", i, ntohl(ndata)); + + expected = (sport % mod); + if (i != expected) + error(1, 0, "expected socket %d", expected); + } +} + +static void test_reuseport_ebpf(struct test_params p) +{ + int i, fd[p.recv_socks]; + + fprintf(stderr, "Testing EBPF mod %zd...\n", p.recv_socks); + build_recv_group(p, fd, p.recv_socks, attach_ebpf); + test_recv_order(p, fd, p.recv_socks); + + p.send_port_min += p.recv_socks * 2; + fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2); + attach_ebpf(fd[0], p.recv_socks / 2); + test_recv_order(p, fd, p.recv_socks / 2); + + for (i = 0; i < p.recv_socks; ++i) + close(fd[i]); +} + +static void test_reuseport_cbpf(struct test_params p) +{ + int i, fd[p.recv_socks]; + + fprintf(stderr, "Testing CBPF mod %zd...\n", p.recv_socks); + build_recv_group(p, fd, p.recv_socks, attach_cbpf); + test_recv_order(p, fd, p.recv_socks); + + p.send_port_min += p.recv_socks * 2; + fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2); + attach_cbpf(fd[0], p.recv_socks / 2); + test_recv_order(p, fd, p.recv_socks / 2); + + for (i = 0; i < p.recv_socks; ++i) + close(fd[i]); +} + +static void test_extra_filter(const struct test_params p) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + int fd1, fd2, opt; + + fprintf(stderr, "Testing too many filters...\n"); + fd1 = socket(p.recv_family, p.protocol, 0); + if (fd1 < 0) + error(1, errno, "failed to create socket 1"); + fd2 = socket(p.recv_family, p.protocol, 0); + if (fd2 < 0) + error(1, errno, "failed to create socket 2"); + + opt = 1; + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 1"); + if (setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 2"); + + attach_ebpf(fd1, 10); + attach_ebpf(fd2, 10); + + if (bind(fd1, addr, sockaddr_size())) + error(1, errno, "failed to bind recv socket 1"); + + if (!bind(fd2, addr, sockaddr_size()) && errno != EADDRINUSE) + error(1, errno, "bind socket 2 should fail with EADDRINUSE"); + + free(addr); +} + +static void test_filter_no_reuseport(const struct test_params p) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + const char bpf_license[] = "GPL"; + struct bpf_insn ecode[] = { + { BPF_ALU64 | BPF_MOV | BPF_K, BPF_REG_0, 0, 0, 10 }, + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + struct sock_filter ccode[] = {{ BPF_RET | BPF_A, 0, 0, 0 }}; + union bpf_attr eprog; + struct sock_fprog cprog; + int fd, bpf_fd; + + fprintf(stderr, "Testing filters on non-SO_REUSEPORT socket...\n"); + + memset(&eprog, 0, sizeof(eprog)); + eprog.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + eprog.insn_cnt = ARRAY_SIZE(ecode); + eprog.insns = (unsigned long) &ecode; + eprog.license = (unsigned long) &bpf_license; + eprog.kern_version = 0; + + memset(&cprog, 0, sizeof(cprog)); + cprog.len = ARRAY_SIZE(ccode); + cprog.filter = ccode; + + + bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &eprog, sizeof(eprog)); + if (bpf_fd < 0) + error(1, errno, "ebpf error"); + fd = socket(p.recv_family, p.protocol, 0); + if (fd < 0) + error(1, errno, "failed to create socket 1"); + + if (bind(fd, addr, sockaddr_size())) + error(1, errno, 
"failed to bind recv socket 1"); + + errno = 0; + if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd, + sizeof(bpf_fd)) || errno != EINVAL) + error(1, errno, "setsockopt should have returned EINVAL"); + + errno = 0; + if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &cprog, + sizeof(cprog)) || errno != EINVAL) + error(1, errno, "setsockopt should have returned EINVAL"); + + free(addr); +} + +static void test_filter_without_bind(void) +{ + int fd1, fd2, opt = 1; + + fprintf(stderr, "Testing filter add without bind...\n"); + fd1 = socket(AF_INET, SOCK_DGRAM, 0); + if (fd1 < 0) + error(1, errno, "failed to create socket 1"); + fd2 = socket(AF_INET, SOCK_DGRAM, 0); + if (fd2 < 0) + error(1, errno, "failed to create socket 2"); + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 1"); + if (setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 2"); + + attach_ebpf(fd1, 10); + attach_cbpf(fd2, 10); + + close(fd1); + close(fd2); +} + +void enable_fastopen(void) +{ + int fd = open("/proc/sys/net/ipv4/tcp_fastopen", 0); + int rw_mask = 3; /* bit 1: client side; bit-2 server side */ + int val, size; + char buf[16]; + + if (fd < 0) + error(1, errno, "Unable to open tcp_fastopen sysctl"); + if (read(fd, buf, sizeof(buf)) <= 0) + error(1, errno, "Unable to read tcp_fastopen sysctl"); + val = atoi(buf); + close(fd); + + if ((val & rw_mask) != rw_mask) { + fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR); + if (fd < 0) + error(1, errno, + "Unable to open tcp_fastopen sysctl for writing"); + val |= rw_mask; + size = snprintf(buf, 16, "%d", val); + if (write(fd, buf, size) <= 0) + error(1, errno, "Unable to write tcp_fastopen sysctl"); + close(fd); + } +} + +static struct rlimit rlim_old; + +static __attribute__((constructor)) void main_ctor(void) +{ + getrlimit(RLIMIT_MEMLOCK, &rlim_old); + + if (rlim_old.rlim_cur != RLIM_INFINITY) { + struct rlimit rlim_new; + + rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20); + rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20); + setrlimit(RLIMIT_MEMLOCK, &rlim_new); + } +} + +static __attribute__((destructor)) void main_dtor(void) +{ + setrlimit(RLIMIT_MEMLOCK, &rlim_old); +} + +int main(void) +{ + fprintf(stderr, "---- IPv4 UDP ----\n"); + /* NOTE: UDP socket lookups traverse a different code path when there + * are > 10 sockets in a group. Run the bpf test through both paths. 
+ */ + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8000, + .send_port_min = 9000}); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8000, + .send_port_min = 9000}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8001, + .send_port_min = 9020}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8001, + .send_port_min = 9020}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_port = 8002}); + test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_port = 8008}); + + fprintf(stderr, "---- IPv6 UDP ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8003, + .send_port_min = 9040}); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8003, + .send_port_min = 9040}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8004, + .send_port_min = 9060}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8004, + .send_port_min = 9060}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_port = 8005}); + test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_port = 8009}); + + fprintf(stderr, "---- IPv6 UDP w/ mapped IPv4 ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8006, + .send_port_min = 9080}); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8006, + .send_port_min = 9080}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8007, + .send_port_min = 9100}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 20, + .recv_port = 8007, + .send_port_min = 9100}); + + /* TCP fastopen is required for the TCP tests */ + enable_fastopen(); + fprintf(stderr, "---- IPv4 TCP ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8008, + .send_port_min = 9120}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8009, + .send_port_min = 9160}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_port = 8010}); + 
test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_port = 8011}); + + fprintf(stderr, "---- IPv6 TCP ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8012, + .send_port_min = 9200}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8013, + .send_port_min = 9240}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_STREAM, + .recv_port = 8014}); + test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_STREAM, + .recv_port = 8015}); + + fprintf(stderr, "---- IPv6 TCP w/ mapped IPv4 ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8016, + .send_port_min = 9320}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_STREAM, + .recv_socks = 10, + .recv_port = 8017, + .send_port_min = 9360}); + + test_filter_without_bind(); + + fprintf(stderr, "SUCCESS\n"); + return 0; +} diff --git a/tools/testing/selftests/net/reuseport_bpf_cpu.c b/tools/testing/selftests/net/reuseport_bpf_cpu.c new file mode 100644 index 000000000000..b23d6f54de7b --- /dev/null +++ b/tools/testing/selftests/net/reuseport_bpf_cpu.c @@ -0,0 +1,258 @@ +/* + * Test functionality of BPF filters with SO_REUSEPORT. This program creates + * an SO_REUSEPORT receiver group containing one socket per CPU core. It then + * creates a BPF program that will select a socket from this group based + * on the core id that receives the packet. The sending code artificially + * moves itself to run on different core ids and sends one message from + * each core. Since these packets are delivered over loopback, they should + * arrive on the same core that sent them. The receiving code then ensures + * that the packet was received on the socket for the corresponding core id. + * This entire process is done for several different core id permutations + * and for each IPv4/IPv6 and TCP/UDP combination. 
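+ *
+ * The steering itself is done by a two-instruction classic BPF program
+ * attached with SO_ATTACH_REUSEPORT_CBPF: it loads the id of the CPU
+ * handling the packet via the SKF_AD_CPU ancillary offset and returns it
+ * as the index into the reuseport group (see attach_bpf() below).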
+ */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <linux/filter.h> +#include <linux/in.h> +#include <linux/unistd.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <unistd.h> + +static const int PORT = 8888; + +static void build_rcv_group(int *rcv_fd, size_t len, int family, int proto) +{ + struct sockaddr_storage addr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + size_t i; + int opt; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)&addr; + addr4->sin_family = AF_INET; + addr4->sin_addr.s_addr = htonl(INADDR_ANY); + addr4->sin_port = htons(PORT); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)&addr; + addr6->sin6_family = AF_INET6; + addr6->sin6_addr = in6addr_any; + addr6->sin6_port = htons(PORT); + break; + default: + error(1, 0, "Unsupported family %d", family); + } + + for (i = 0; i < len; ++i) { + rcv_fd[i] = socket(family, proto, 0); + if (rcv_fd[i] < 0) + error(1, errno, "failed to create receive socket"); + + opt = 1; + if (setsockopt(rcv_fd[i], SOL_SOCKET, SO_REUSEPORT, &opt, + sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT"); + + if (bind(rcv_fd[i], (struct sockaddr *)&addr, sizeof(addr))) + error(1, errno, "failed to bind receive socket"); + + if (proto == SOCK_STREAM && listen(rcv_fd[i], len * 10)) + error(1, errno, "failed to listen on receive port"); + } +} + +static void attach_bpf(int fd) +{ + struct sock_filter code[] = { + /* A = raw_smp_processor_id() */ + { BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU }, + /* return A */ + { BPF_RET | BPF_A, 0, 0, 0 }, + }; + struct sock_fprog p = { + .len = 2, + .filter = code, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) + error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF"); +} + +static void send_from_cpu(int cpu_id, int family, int proto) +{ + struct sockaddr_storage saddr, daddr; + struct sockaddr_in *saddr4, *daddr4; + struct sockaddr_in6 *saddr6, *daddr6; + cpu_set_t cpu_set; + int fd; + + switch (family) { + case AF_INET: + saddr4 = (struct sockaddr_in *)&saddr; + saddr4->sin_family = AF_INET; + saddr4->sin_addr.s_addr = htonl(INADDR_ANY); + saddr4->sin_port = 0; + + daddr4 = (struct sockaddr_in *)&daddr; + daddr4->sin_family = AF_INET; + daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + daddr4->sin_port = htons(PORT); + break; + case AF_INET6: + saddr6 = (struct sockaddr_in6 *)&saddr; + saddr6->sin6_family = AF_INET6; + saddr6->sin6_addr = in6addr_any; + saddr6->sin6_port = 0; + + daddr6 = (struct sockaddr_in6 *)&daddr; + daddr6->sin6_family = AF_INET6; + daddr6->sin6_addr = in6addr_loopback; + daddr6->sin6_port = htons(PORT); + break; + default: + error(1, 0, "Unsupported family %d", family); + } + + memset(&cpu_set, 0, sizeof(cpu_set)); + CPU_SET(cpu_id, &cpu_set); + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) + error(1, errno, "failed to pin to cpu"); + + fd = socket(family, proto, 0); + if (fd < 0) + error(1, errno, "failed to create send socket"); + + if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr))) + error(1, errno, "failed to bind send socket"); + + if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr))) + error(1, errno, "failed to connect send socket"); + + if (send(fd, "a", 1, 0) < 0) + error(1, errno, "failed to send message"); + + close(fd); +} + +static +void receive_on_cpu(int *rcv_fd, int len, int epfd, 
int cpu_id, int proto) +{ + struct epoll_event ev; + int i, fd; + char buf[8]; + + i = epoll_wait(epfd, &ev, 1, -1); + if (i < 0) + error(1, errno, "epoll_wait failed"); + + if (proto == SOCK_STREAM) { + fd = accept(ev.data.fd, NULL, NULL); + if (fd < 0) + error(1, errno, "failed to accept"); + i = recv(fd, buf, sizeof(buf), 0); + close(fd); + } else { + i = recv(ev.data.fd, buf, sizeof(buf), 0); + } + + if (i < 0) + error(1, errno, "failed to recv"); + + for (i = 0; i < len; ++i) + if (ev.data.fd == rcv_fd[i]) + break; + if (i == len) + error(1, 0, "failed to find socket"); + fprintf(stderr, "send cpu %d, receive socket %d\n", cpu_id, i); + if (cpu_id != i) + error(1, 0, "cpu id/receive socket mismatch"); +} + +static void test(int *rcv_fd, int len, int family, int proto) +{ + struct epoll_event ev; + int epfd, cpu; + + build_rcv_group(rcv_fd, len, family, proto); + attach_bpf(rcv_fd[0]); + + epfd = epoll_create(1); + if (epfd < 0) + error(1, errno, "failed to create epoll"); + for (cpu = 0; cpu < len; ++cpu) { + ev.events = EPOLLIN; + ev.data.fd = rcv_fd[cpu]; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fd[cpu], &ev)) + error(1, errno, "failed to register sock epoll"); + } + + /* Forward iterate */ + for (cpu = 0; cpu < len; ++cpu) { + send_from_cpu(cpu, family, proto); + receive_on_cpu(rcv_fd, len, epfd, cpu, proto); + } + + /* Reverse iterate */ + for (cpu = len - 1; cpu >= 0; --cpu) { + send_from_cpu(cpu, family, proto); + receive_on_cpu(rcv_fd, len, epfd, cpu, proto); + } + + /* Even cores */ + for (cpu = 0; cpu < len; cpu += 2) { + send_from_cpu(cpu, family, proto); + receive_on_cpu(rcv_fd, len, epfd, cpu, proto); + } + + /* Odd cores */ + for (cpu = 1; cpu < len; cpu += 2) { + send_from_cpu(cpu, family, proto); + receive_on_cpu(rcv_fd, len, epfd, cpu, proto); + } + + close(epfd); + for (cpu = 0; cpu < len; ++cpu) + close(rcv_fd[cpu]); +} + +int main(void) +{ + int *rcv_fd, cpus; + + cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (cpus <= 0) + error(1, errno, "failed counting cpus"); + + rcv_fd = calloc(cpus, sizeof(int)); + if (!rcv_fd) + error(1, 0, "failed to allocate array"); + + fprintf(stderr, "---- IPv4 UDP ----\n"); + test(rcv_fd, cpus, AF_INET, SOCK_DGRAM); + + fprintf(stderr, "---- IPv6 UDP ----\n"); + test(rcv_fd, cpus, AF_INET6, SOCK_DGRAM); + + fprintf(stderr, "---- IPv4 TCP ----\n"); + test(rcv_fd, cpus, AF_INET, SOCK_STREAM); + + fprintf(stderr, "---- IPv6 TCP ----\n"); + test(rcv_fd, cpus, AF_INET6, SOCK_STREAM); + + free(rcv_fd); + + fprintf(stderr, "SUCCESS\n"); + return 0; +} |
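
For readers who want to try the mechanism outside the selftest harness, the core of reuseport_bpf_cpu.c boils down to the sketch below: one SO_REUSEPORT socket per online CPU, plus a two-instruction classic BPF filter that returns the receiving CPU id as the index into the group. This is only an illustrative, standalone setup, not part of the patch; the UDP/IPv4 choice and port 8888 mirror the test but are otherwise arbitrary, and the fallback define assumes the asm-generic/socket.h value. It creates the group and attaches the filter; verifying delivery still needs the send/epoll logic from the test above.

/* Standalone sketch: CPU-based reuseport steering. Requires a kernel with
 * SO_ATTACH_REUSEPORT_CBPF support (4.5+).
 */
#include <errno.h>
#include <error.h>
#include <linux/filter.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_ATTACH_REUSEPORT_CBPF
#define SO_ATTACH_REUSEPORT_CBPF 51	/* value from asm-generic/socket.h */
#endif

int main(void)
{
	struct sock_filter code[] = {
		/* A = id of the CPU handling the packet */
		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
		/* return A: used as the index into the reuseport group;
		 * an out-of-range index falls back to hash-based selection
		 */
		{ BPF_RET | BPF_A, 0, 0, 0 },
	};
	struct sock_fprog prog = { .len = 2, .filter = code };
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(8888),
	};
	long cpus = sysconf(_SC_NPROCESSORS_ONLN);
	int *fd, opt = 1, i;

	if (cpus <= 0)
		error(1, errno, "failed counting cpus");
	fd = calloc(cpus, sizeof(int));
	if (!fd)
		error(1, 0, "failed to allocate fd array");

	/* One socket per CPU, all bound to the same address and port. */
	for (i = 0; i < cpus; i++) {
		fd[i] = socket(AF_INET, SOCK_DGRAM, 0);
		if (fd[i] < 0)
			error(1, errno, "failed to create socket %d", i);
		if (setsockopt(fd[i], SOL_SOCKET, SO_REUSEPORT, &opt,
			       sizeof(opt)))
			error(1, errno, "failed to set SO_REUSEPORT");
		if (bind(fd[i], (struct sockaddr *)&addr, sizeof(addr)))
			error(1, errno, "failed to bind socket %d", i);
	}

	/* Attaching the filter to one member installs it for the whole group. */
	if (setsockopt(fd[0], SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
		       &prog, sizeof(prog)))
		error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF");

	fprintf(stderr, "%ld-socket reuseport group with CPU steering ready\n",
		cpus);

	for (i = 0; i < cpus; i++)
		close(fd[i]);
	free(fd);
	return 0;
}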