diff options
| author | Davide Garberi <dade.garberi@gmail.com> | 2023-05-28 15:57:41 +0200 |
|---|---|---|
| committer | Davide Garberi <dade.garberi@gmail.com> | 2023-08-03 02:55:58 +0200 |
| commit | ce2e1dc133974cf19668249e2f527bb98ddf449f (patch) | |
| tree | 7c462c49b405757413d42411042274e035debab4 /kernel | |
| parent | 7ef1b1f38a94bf54b215c4c56ea8d62ec607f73b (diff) | |
| parent | 1a4b80f8f2017576f530658f48659d9a222f4648 (diff) | |
Merge lineage-20 of git@github.com:LineageOS/android_kernel_qcom_msm8998.git into lineage-20
1a4b80f8f201 ANDROID: arch:arm64: Increase kernel command line size
7c253f7aa663 of: reserved_mem: increase max number reserved regions
df4dbf557503 msm: camera: Fix indentations
2fc4a156d15d msm: camera: Fix code flow when populating CAM_V_CUSTOM1
687bcb61f125 ALSA: control: use counting semaphore as write lock for ELEM_WRITE operation
75cf9e8c1b1c ALSA: control: Fix memory corruption risk in snd_ctl_elem_read
76cf3b5e53df ALSA: control: code refactoring for ELEM_READ/ELEM_WRITE operations
e9af212f9685 ALSA: pcm: Move rwsem lock inside snd_ctl_elem_read to prevent UAF
95fc4fff573f msm: kgsl: Make sure that pool pages don't have any extra references
59ceabe0d242 msm: kgsl: Use dma_buf_get() to get dma_buf structure
d1f19956d6b9 ANDROID: usb: f_accessory: Check buffer size when initialised via composite
2d3ce4f7a366 kbuild: handle libs-y archives separately from built-in.o archives
65dc3fbd1593 kbuild: thin archives use P option to ar
362c7b73bac8 kbuild: thin archives for multi-y targets
43076241b514 kbuild: thin archives final link close --whole-archives option
aa04fc78256d kbuild: minor improvement for thin archives build
f5896747cda6 Merge tag 'LA.UM.7.2.c25-07700-sdm660.0' of https://git.codelinaro.org/clo/la/platform/vendor/qcom-opensource/wlan/qcacld-3.0 into android13-4.4-msm8998
321ac077ee7e qcacld-3.0: Fix out-of-bounds in tx_stats
42be8e4cbf13 BACKPORT: usb: gadget: rndis: prevent integer overflow in rndis_set_response()
b490a85b5945 FROMGIT: arm64: fix oops in concurrently setting insn_emulation sysctls
7ed7084b34a9 FROMLIST: binder: fix UAF of ref->proc caused by race condition
e31f087fb864 ANDROID: selinux: modify RTM_GETNEIGH{TBL}
80675d431434 UPSTREAM: usb: gadget: clear related members when goto fail
fb6adfb00108 UPSTREAM: usb: gadget: don't release an existing dev->buf
e4a8dd12424e UPSTREAM: USB: gadget: validate interface OS descriptor requests
8f0a947317e0 UPSTREAM: usb: gadget: rndis: check size of RNDIS_MSG_SET command
1541758765ff ion: Do not 'put' ION handle until after its final use
03b4b3cd8d30 Merge tag 'LA.UM.7.2.c25-07000-sdm660.0' of https://git.codelinaro.org/clo/la/platform/vendor/qcom-opensource/wlan/qcacld-3.0 into android13-4.4-msm8998
7dbda95466d5 Merge tag 'LA.UM.8.4.c25-06600-8x98.0' of https://git.codelinaro.org/clo/la/kernel/msm-4.4 into android13-4.4-msm8998
369119e5df4e cert host tools: Stop complaining about deprecated OpenSSL functions
f8e30a0f9a17 fixup! BACKPORT: treewide: Fix function prototypes for module_param_call()
4fa5045f3dc9 arm64/efi: Mark __efistub_stext_offset as an absolute symbol explicitly
bcd9668da77f arm64: kernel: do not need to reset UAO on exception entry
c4ddd677f7e3 Kbuild: do not emit debug info for assembly with LLVM_IAS=1
1b880b6e19f8 qcacld-3.0: Add time slice duty cycle in wifi_interface_info
fd24be2b22a1 qcacmn: Add time slice duty cycle attribute into QCA vendor command
d719c1c825f8 qcacld-3.0: Use field-by-field assignment for FW stats
fb5eb3bda2d9 ext4: enable quota enforcement based on mount options
cd40d7f301de ext4: adds project ID support
360e2f3d18b8 ext4: add project quota support
c31ac2be1594 drivers: qcacld-3.0: Remove in_compat_syscall() redefinition
6735c13a269d arm64: link with -z norelro regardless of CONFIG_RELOCATABLE
99962aab3433 arm64: relocatable: fix inconsistencies in linker script and options
24bd8cc5e6bb arm64: prevent regressions in compressed kernel image size when upgrading to binutils 2.27
93bb4c2392a2 arm64: kernel: force ET_DYN ELF type for CONFIG_RELOCATABLE=y
a54bbb725ccb arm64: build with baremetal linker target instead of Linux when available
c5805c604a9b arm64: add endianness option to LDFLAGS instead of LD
ab6052788f60 arm64: Set UTS_MACHINE in the Makefile
c3330429b2c6 kbuild: clear LDFLAGS in the top Makefile
f33c1532bd61 kbuild: use HOSTLDFLAGS for single .c executables
38b7db363a96 BACKPORT: arm64: Change .weak to SYM_FUNC_START_WEAK_PI for arch/arm64/lib/mem*.S
716cb63e81d9 BACKPORT: crypto: arm64/aes-ce-cipher - move assembler code to .S file
7dfbaee16432 BACKPORT: arm64: Remove reference to asm/opcodes.h
531ee8624d17 BACKPORT: arm64: kprobe: protect/rename few definitions to be reused by uprobe
08d83c997b0c BACKPORT: arm64: Delete the space separator in __emit_inst
e3951152dc2d BACKPORT: arm64: Get rid of asm/opcodes.h
255820c0f301 BACKPORT: arm64: Fix minor issues with the dcache_by_line_op macro
21bb344a664b BACKPORT: crypto: arm64/aes-modes - get rid of literal load of addend vector
26d5a53c6e0d BACKPORT: arm64: vdso: remove commas between macro name and arguments
78bff1f77c9d BACKPORT: kbuild: support LLVM=1 to switch the default tools to Clang/LLVM
6634f9f63efe BACKPORT: kbuild: replace AS=clang with LLVM_IAS=1
b891e8fdc466 BACKPORT: Documentation/llvm: fix the name of llvm-size
75d6fa8368a8 BACKPORT: Documentation/llvm: add documentation on building w/ Clang/LLVM
95b0a5e52f2a BACKPORT: ANDROID: ftrace: fix function type mismatches
7da9c2138ec8 BACKPORT: ANDROID: fs: logfs: fix filler function type
d6d5a4b28ad0 BACKPORT: ANDROID: fs: gfs2: fix filler function type
9b194a470db5 BACKPORT: ANDROID: fs: exofs: fix filler function type
7a45ac4bfb49 BACKPORT: ANDROID: fs: afs: fix filler function type
4099e1b281e5 BACKPORT: drivers/perf: arm_pmu: fix function type mismatch
af7b738882f7 BACKPORT: dummycon: fix function types
1b0b55a36dbe BACKPORT: fs: nfs: fix filler function type
a58a0e30e20a BACKPORT: mm: fix filler function type mismatch
829e9226a8c0 BACKPORT: mm: fix drain_local_pages function type
865ef61b4da8 BACKPORT: vfs: pass type instead of fn to do_{loop,iter}_readv_writev()
08d2f8e7ba8e BACKPORT: module: Do not paper over type mismatches in module_param_call()
ea467f6c33e4 BACKPORT: treewide: Fix function prototypes for module_param_call()
d131459e6b8b BACKPORT: module: Prepare to convert all module_param_call() prototypes
6f52abadf006 BACKPORT: kbuild: fix --gc-sections
bf7540ffce44 BACKPORT: kbuild: record needed exported symbols for modules
c49d2545e437 BACKPORT: kbuild: Allow to specify composite modules with modname-m
427d0fc67dc1 BACKPORT: kbuild: add arch specific post-link Makefile
69f8a31838a3 BACKPORT: arm64: add a workaround for GNU gold with ARM64_MODULE_PLTS
ba3368756abf BACKPORT: arm64: explicitly pass --no-fix-cortex-a53-843419 to GNU gold
6dacd7e737fb BACKPORT: arm64: errata: Pass --fix-cortex-a53-843419 to ld if workaround enabled
d2787c21f2b5 BACKPORT: kbuild: add __ld-ifversion and linker-specific macros
2d471de60bb4 BACKPORT: kbuild: add ld-name macro
06280a90d845 BACKPORT: arm64: keep .altinstructions and .altinstr_replacement
eb0ad3ae07f9 BACKPORT: kbuild: add __cc-ifversion and compiler-specific variants
3d01e1eba86b BACKPORT: FROMLIST: kbuild: add clang-version.sh
18dd378ab563 BACKPORT: FROMLIST: kbuild: fix LD_DEAD_CODE_DATA_ELIMINATION
aabbc122b1de BACKPORT: kbuild: thin archives make default for all archs
756d47e345fc BACKPORT: kbuild: allow archs to select link dead code/data elimination
723ab99e48a7 BACKPORT: kbuild: allow architectures to use thin archives instead of ld -r
0b77ec583772 drivers/usb/serial/console.c: remove superfluous serial->port condition
6488cb478f04 drivers/firmware/efi/libstub.c: prevent a relocation
dba4259216a0 UPSTREAM: pidfd: fix a poll race when setting exit_state
baab6e33b07b BACKPORT: arch: wire-up pidfd_open()
5d2e9e4f8630 BACKPORT: pid: add pidfd_open()
f8396a127daf UPSTREAM: pidfd: add polling support
f4c358582254 UPSTREAM: signal: improve comments
5500316dc8d8 UPSTREAM: fork: do not release lock that wasn't taken
fc7d707593e3 BACKPORT: signal: support CLONE_PIDFD with pidfd_send_signal
f044fa00d72a BACKPORT: clone: add CLONE_PIDFD
f20fc1c548f2 UPSTREAM: Make anon_inodes unconditional
de80525cd462 UPSTREAM: signal: use fdget() since we don't allow O_PATH
229e1bdd624e UPSTREAM: signal: don't silently convert SI_USER signals to non-current pidfd
ada02e996b52 BACKPORT: signal: add pidfd_send_signal() syscall
828857678c5c compat: add in_compat_syscall to ask whether we're in a compat syscall
e7aede4896c0 bpf: Add new cgroup attach type to enable sock modifications
9ed75228b09c ebpf: allow bpf_get_current_uid_gid_proto also for networking
c5aa3963b4ae bpf: fix overflow in prog accounting
c46a001439fc bpf: Make sure mac_header was set before using it
8aed99185615 bpf: Enlarge offset check value to INT_MAX in bpf_skb_{load,store}_bytes
b0a638335ba6 bpf: avoid false sharing of map refcount with max_entries
1f21605e373c net: remove hlist_nulls_add_tail_rcu()
9ce369b09dbb udp: get rid of SLAB_DESTROY_BY_RCU allocations
070f539fb5d7 udp: no longer use SLAB_DESTROY_BY_RCU
a32d2ea857c5 inet: refactor inet[6]_lookup functions to take skb
fcf3e7bc7203 soreuseport: fix initialization race
df03c8cf024a soreuseport: Fix TCP listener hash collision
bd8b9f50c9d3 inet: Fix missing return value in inet6_hash
bae331196dd0 soreuseport: fast reuseport TCP socket selection
4ada2ed73da0 inet: create IPv6-equivalent inet_hash function
73f609838475 sock: struct proto hash function may error
e3b32750621b cgroup: Fix sock_cgroup_data on big-endian.
69dabcedd4b9 selinux: always allow mounting submounts
17d6ddebcc49 userns: Don't fail follow_automount based on s_user_ns
cbd08255e6f8 fs: Better permission checking for submounts
3a9ace719251 mnt: Move the FS_USERNS_MOUNT check into sget_userns
af53549b43c5 locks: sprinkle some tracepoints around the file locking code
07dbbc84aa34 locks: rename __posix_lock_file to posix_lock_inode
400cbe93d180 autofs: Fix automounts by using current_real_cred()->uid
7903280ee07a fs: Call d_automount with the filesystems creds
b87fb50ff1cd UPSTREAM: kernfs: Check KERNFS_HAS_RELEASE before calling kernfs_release_file()
c9c596de3e52 UPSTREAM: kernfs: fix locking around kernfs_ops->release() callback
2172eaf5a901 UPSTREAM: cgroup, bpf: remove unnecessary #include
dc81f3963dde kernfs: kernfs_sop_show_path: don't return 0 after seq_dentry call
ce9a52e20897 cgroup: Make rebind_subsystems() disable v2 controllers all at once
ce5e3aa14c39 cgroup: fix sock_cgroup_data initialization on earlier compilers
94a70ef24da9 samples/bpf: fix bpf_perf_event_output prototype
c1920272278e net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list
d7707635776b sk_buff: allow segmenting based on frag sizes
924bbacea75e ip_tunnel, bpf: ip_tunnel_info_opts_{get, set} depends on CONFIG_INET
0e9008d618f4 bpf: udp: ipv6: Avoid running reuseport's bpf_prog from __udp6_lib_err
01b437940f5e soreuseport: add compat case for setsockopt SO_ATTACH_REUSEPORT_CBPF
421fbf04bf2c soreuseport: change consume_skb to kfree_skb in error case
1ab50514c430 ipv6: Fix SO_REUSEPORT UDP socket with implicit sk_ipv6only
f3dfd61c502d soreuseport: fix ordering for mixed v4/v6 sockets
245ee3c90795 soreuseport: fix NULL ptr dereference SO_REUSEPORT after bind
113fb209854a bpf: do not blindly change rlimit in reuseport net selftest
985253ef27d2 bpf: fix rlimit in reuseport net selftest
ae61334510be soreuseport: Fix reuseport_bpf testcase on 32bit architectures
6efa24da01a5 udp: fix potential infinite loop in SO_REUSEPORT logic
66df70c6605d soreuseport: BPF selection functional test for TCP
fe161031b8a8 soreuseport: pass skb to secondary UDP socket lookup
9223919efdf2 soreuseport: BPF selection functional test
2090ed790dbb soreuseport: fix mem leak in reuseport_add_sock()
67887f6ac3f1 Merge "diag: Ensure dci entry is valid before sending the packet"
e41c0da23b38 diag: Prevent out of bound write while sending dci pkt to remote
e1085d1ef39b diag: Ensure dci entry is valid before sending the packet
16802e80ecb5 Merge "ion: Fix integer overflow in msm_ion_custom_ioctl"
57146f83f388 ion: Fix integer overflow in msm_ion_custom_ioctl
6fc2001969fe diag: Use valid data_source for a valid token
0c6dbf858a98 qcacld-3.0: Avoid OOB read in dot11f_unpack_assoc_response
f07caca0c485 qcacld-3.0: Fix array OOB for duplicate rate
5a359aba0364 msm: kgsl: Remove 'fd' dependency to get dma_buf handle
da8317596949 msm: kgsl: Fix gpuaddr_in_range() to check upper bound
2ed91a98d8b4 msm: adsprpc: Handle UAF in fastrpc debugfs read
2967159ad303 msm: kgsl: Add a sysfs node to control performance counter reads
e392a84f25f5 msm: kgsl: Perform cache flush on the pages obtained using get_user_pages()
28b45f75d2ee soc: qcom: hab: Add sanity check for payload_count
885caec7690f Merge "futex: Fix inode life-time issue"
0f57701d2643 Merge "futex: Handle faults correctly for PI futexes"
7d7eb450c333 Merge "futex: Rework inconsistent rt_mutex/futex_q state"
124ebd87ef2f msm: kgsl: Fix out of bound write in adreno_profile_submit_time
228bbfb25032 futex: Fix inode life-time issue
7075ca6a22b3 futex: Handle faults correctly for PI futexes
a436b73e9032 futex: Simplify fixup_pi_state_owner()
11b99dbe3221 futex: Use pi_state_update_owner() in put_pi_state()
f34484030550 rtmutex: Remove unused argument from rt_mutex_proxy_unlock()
079d1c90b3c3 futex: Provide and use pi_state_update_owner()
3b51e24eb17b futex: Replace pointless printk in fixup_owner()
0eac5c2583a1 futex: Avoid violating the 10th rule of futex
6d6ed38b7d10 futex: Rework inconsistent rt_mutex/futex_q state
3c8f7dfd59b5 futex: Remove rt_mutex_deadlock_account_*()
9c870a329520 futex,rt_mutex: Provide futex specific rt_mutex API
7504736e8725 msm: adsprpc: Handle UAF in process shell memory
994e5922a0c2 Disable TRACER Check to improve Camera Performance
8fb3f17b3ad1 msm: kgsl: Deregister gpu address on memdesc_sg_virt failure
13aa628efdca Merge "crypto: Fix possible stack out-of-bound error"
92e777451003 Merge "msm: kgsl: Correct the refcount on current process PID."
9ca218394ed4 Merge "msm: kgsl: Compare pid pointer instead of TGID for a new process"
7eed1f2e0f43 Merge "qcom,max-freq-level change for trial"
6afb5eb98e36 crypto: Fix possible stack out-of-bound error
8b5ba278ed4b msm: kgsl: Correct the refcount on current process PID.
4150552fac96 msm: kgsl: Compare pid pointer instead of TGID for a new process
c272102c0793 qcom,max-freq-level change for trial
854ef3ce73f5 msm: kgsl: Protect the memdesc->gpuaddr in SVM use cases.
79c8161aeac9 msm: kgsl: Stop using memdesc->usermem.
Change-Id: Iea7db1362c3cd18e36f243411e773a9054f6a445
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/cgroup.c | 33 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 24 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 57 | ||||
| -rw-r--r-- | kernel/cgroup.c | 31 | ||||
| -rw-r--r-- | kernel/exit.c | 1 | ||||
| -rw-r--r-- | kernel/fork.c | 136 | ||||
| -rw-r--r-- | kernel/pid.c | 74 | ||||
| -rw-r--r-- | kernel/signal.c | 142 | ||||
| -rw-r--r-- | kernel/trace/ftrace.c | 17 | ||||
| -rw-r--r-- | kernel/trace/trace.c | 4 |
10 files changed, 482 insertions, 37 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 357ce8355d57..8210c7dd7532 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -424,3 +424,36 @@ int __cgroup_bpf_run_filter(struct sock *sk, return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]->progs[0]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 
0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95ffe1fac0bf..2b1a925489cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -60,11 +60,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns { u8 *ptr = NULL; - if (k >= SKF_NET_OFF) + if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) + } else if (k >= SKF_LL_OFF) { + if (unlikely(!skb_mac_header_was_set(skb))) + return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - + } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -105,19 +107,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a73a056ccd88..f65bd2dc07f8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -652,19 +652,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + 
user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages, &user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -673,7 +693,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } @@ -933,7 +953,24 @@ static int bpf_prog_attach(const union bpf_attr *attr) bpf_prog_put(prog); cgroup_put(cgrp); break; + case BPF_CGROUP_INET_SOCK_CREATE: + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_CGROUP_SOCK); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + if (ret) + bpf_prog_put(prog); + cgroup_put(cgrp); + break; default: return -EINVAL; } @@ -961,7 +998,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_EGRESS: ptype = BPF_PROG_TYPE_CGROUP_SKB; break; - + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; default: return -EINVAL; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d36c4f914a1e..c2508ca442b7 100644 --- a/kernel/cgroup.c +++ 
b/kernel/cgroup.c @@ -1544,6 +1544,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1560,8 +1561,28 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1570,10 +1591,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); diff --git a/kernel/exit.c b/kernel/exit.c index babbc3c0a181..052aa52de331 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -616,6 +616,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = 
thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/fork.c b/kernel/fork.c index 92a0df862115..a21adc0155b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> @@ -38,6 +39,7 @@ #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/seccomp.h> +#include <linux/seq_file.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> @@ -1294,6 +1296,84 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) task->pids[type].pid = pid; } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = file_inode(m->file)->i_sb->s_fs_info; + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct task_struct *task; + struct pid *pid = file->private_data; + int poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + /* + * Inform pollers only when the whole thread group exits. + * If the thread group leader exits before all other threads in the + * group, then poll(2) should block, similar to the wait(2) family. 
+ */ + if (!task || (task->exit_state && thread_group_empty(task))) + poll_flags = POLLIN | POLLRDNORM; + rcu_read_unlock(); + + return poll_flags; +} + +const struct file_operations pidfd_fops = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1305,13 +1385,14 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) @@ -1356,6 +1437,31 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. 
+ */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1538,6 +1644,22 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1587,7 +1709,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -1698,8 +1820,12 @@ bad_fork_cancel_cgroup: spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_cgroup_threadgroup_change_end: + cgroup_threadgroup_change_end(current); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + sys_close(pidfd); bad_fork_free_pid: - threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1754,7 +1880,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); @@ -1799,7 +1925,7 @@ long
_do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer diff --git a/kernel/pid.c b/kernel/pid.c index ccfdb56321c6..9bc06f3a2b54 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -38,6 +38,8 @@ #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/proc_fs.h> +#include <linux/anon_inodes.h> +//#include <linux/sched/signal.h> #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -331,6 +333,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) @@ -564,6 +568,76 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return pid; } +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +/** + * pidfd_open() - Open new pid file descriptor. + * + * @pid: pid for which to retrieve a pidfd + * @flags: flags to pass + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set for + * the process identified by @pid. 
Currently, the process identified by + * @pid must be a thread-group leader. This restriction currently exists + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot + * be used with CLONE_THREAD) and pidfd polling (only supports thread group + * leaders). + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) +{ + int fd, ret; + struct pid *p; + struct task_struct *tsk; + + if (flags) + return -EINVAL; + + if (pid <= 0) + return -EINVAL; + + p = find_get_pid(pid); + if (!p) + return -ESRCH; + + ret = 0; + rcu_read_lock(); + tsk = pid_task(p, PIDTYPE_PID); + /* Check that pid belongs to a group leader task */ + if (!tsk || !thread_group_leader(tsk)) + ret = -EINVAL; + rcu_read_unlock(); + + fd = ret ?: pidfd_create(p); + put_pid(p); + return fd; +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or diff --git a/kernel/signal.c b/kernel/signal.c index a699055ebfe8..a9697e189a58 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -14,7 +14,9 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -1632,6 +1634,14 @@ ret: return ret; } +static void do_notify_pidfd(struct task_struct *task) +{ + struct pid *pid; + + pid = task_pid(task); + wake_up_all(&pid->wait_pidfd); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. 
@@ -1655,6 +1665,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* Wake up all pidfd waiters */ + do_notify_pidfd(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. @@ -2922,6 +2935,15 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +static inline void prepare_kill_siginfo(int sig, struct siginfo *info) +{ + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -2931,15 +2953,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + +static int copy_siginfo_from_user_any(siginfo_t *kinfo, siginfo_t __user *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. 
+ */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_from_user(kinfo, info, sizeof(siginfo_t)); +} + +/** + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + /* Only allow sending arbitrary signals to yourself. 
*/ + ret = -EPERM; + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + goto err; + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c5484723abda..74fa302d9a68 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -128,8 +128,9 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); #else /* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#define ftrace_ops_list_func ftrace_ops_no_ops #endif /* @@ -5229,7 +5230,8 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } @@ -5677,14 +5679,17 @@ void ftrace_graph_graph_time_control(bool enable) fgraph_graph_time = enable; } +void ftrace_graph_return_stub(struct ftrace_graph_ret *trace) +{ +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; } /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_return_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = 
ftrace_graph_entry_stub; @@ -5912,7 +5917,7 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_return_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a67f792fb950..17996354c745 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7026,7 +7026,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7039,7 +7039,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; |
