Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c              81
-rw-r--r--  kernel/events/core.c           26
-rw-r--r--  kernel/trace/bpf_trace.c       82
-rw-r--r--  kernel/trace/trace_kprobe.c     6
-rw-r--r--  kernel/trace/trace_syscalls.c   8
-rw-r--r--  kernel/trace/trace_uprobe.c     3
6 files changed, 175 insertions(+), 31 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d08285b32d8c..c459c2533271 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1060,6 +1060,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const struct sk_buff *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
/* to avoid allocating empty bpf_prog_array for cgroups that
* don't have bpf program attached use one global 'empty_prog_array'
* It will not be modified the caller of bpf_prog_array_alloc()
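
A note on the dummy program introduced above: BPF_PROG_RUN() resolves to an
indirect call through prog->bpf_func, so parking &dummy_bpf_prog.prog in an
array slot (as bpf_prog_array_delete_safe() in the next hunk does) keeps
concurrent RCU readers safe: they call __bpf_prog_ret1() and see "no filter".
A minimal sketch of the reader's view; run_one() is a hypothetical helper,
not part of this patch:

    static unsigned int run_one(const struct bpf_prog *prog, void *ctx)
    {
            /* what BPF_PROG_RUN(prog, ctx) boils down to */
            return prog->bpf_func(ctx, prog->insnsi); /* dummy returns 1 */
    }
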
@@ -1091,6 +1105,73 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
kfree_rcu(progs, rcu);
}
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
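
bpf_prog_array_copy() never mutates the old array; a writer builds a
replacement and publishes it, which is what lets readers run lock-free.
A condensed sketch of the intended update pattern, with a hypothetical
writer_mutex standing in for whatever lock the caller uses (the real
writer side is perf_event_attach_bpf_prog() in the bpf_trace.c hunk below):

    mutex_lock(&writer_mutex);
    old = rcu_dereference_protected(call->prog_array,
                                    lockdep_is_held(&writer_mutex));
    ret = bpf_prog_array_copy(old, NULL, new_prog, &new);
    if (!ret) {
            rcu_assign_pointer(call->prog_array, new);
            bpf_prog_array_free(old);       /* frees via kfree_rcu() */
    }
    mutex_unlock(&writer_mutex);
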
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3c03da137864..470cb4cb8f74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7203,11 +7203,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
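
bpf_prog_array_valid() itself is not part of this diff; a plausible
definition, consistent with how it is used here and in the tracing hunks
below, is a deliberately racy lockless NULL check:

    static inline bool bpf_prog_array_valid(struct trace_event_call *call)
    {
            /* heuristic only; the authoritative check is the
             * rcu_dereference() inside trace_call_bpf()
             */
            return !!READ_ONCE(call->prog_array);
    }
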
@@ -7410,6 +7408,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type == PERF_TYPE_HARDWARE ||
event->attr.type == PERF_TYPE_SOFTWARE)
@@ -7418,9 +7417,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
if (!is_kprobe && !is_tracepoint)
@@ -7446,26 +7442,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- event->tp_event->bpf_prog_owner = event;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
if (event->attr.type != PERF_TYPE_TRACEPOINT) {
perf_event_free_bpf_handler(event);
return;
}
-
- prog = event->tp_event->prog;
- if (prog && event->tp_event->bpf_prog_owner == event) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
- }
+ perf_event_detach_bpf_prog(event);
}
#else
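
The reworked error path encodes the reference-counting contract: the prog
was acquired earlier in this function (via bpf_prog_get(), outside this
hunk), on success that reference is handed over to the prog_array, and on
failure the caller still owns it and must drop it. In outline (a sketch,
not the full function):

    prog = bpf_prog_get(prog_fd);               /* takes a reference */
    if (IS_ERR(prog))
            return PTR_ERR(prog);
    ...
    ret = perf_event_attach_bpf_prog(event, prog);
    if (ret)
            bpf_prog_put(prog);                 /* attach did not consume it */
    return ret;
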
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a1cd54a16a99..045bde1a64ac 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,7 +17,7 @@
/**
* trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
@@ -29,7 +29,7 @@
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
@@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
goto out;
}
- rcu_read_lock();
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
+ /*
+ * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+ * to all call sites, we check whether call->prog_array is empty or
+ * not via bpf_prog_array_valid() at the call sites, as a heuristic
+ * to speed up execution.
+ *
+ * If the prog_array pointer fetched by bpf_prog_array_valid() was
+ * non-NULL, we come into trace_call_bpf() and do the proper
+ * rcu_dereference() under the RCU lock; if the array turns out to
+ * be NULL here after all, we simply bail out.
+ * Conversely, if the fetched pointer was NULL, the call site skips
+ * trace_call_bpf() entirely, at the accepted risk of missing
+ * events whose programs were attached between that check and the
+ * rcu_dereference() here.
+ */
+ ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
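
BPF_PROG_RUN_ARRAY_CHECK() is defined outside this diff (in
include/linux/bpf.h); roughly, and as a sketch of the expected semantics
rather than the exact macro, it runs every program in the array under RCU
and ANDs the return values together, so any program returning 0 suppresses
the event, while a NULL array yields 1:

    u32 ret = 1;
    struct bpf_prog *p, **prog;

    rcu_read_lock();
    array = rcu_dereference(call->prog_array);
    if (array)
            for (prog = array->progs; (p = READ_ONCE(*prog)); prog++)
                    ret &= BPF_PROG_RUN(p, ctx);
    rcu_read_unlock();

The READ_ONCE() pairs with the WRITE_ONCE() in bpf_prog_array_delete_safe()
above, so a slot whose program was deleted is read as the dummy, not torn.
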
@@ -604,6 +617,65 @@ static const struct bpf_verifier_ops perf_event_prog_ops = {
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret = -EEXIST;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (event->prog)
+ goto out;
+
+ old_array = rcu_dereference_protected(event->tp_event->prog_array,
+ lockdep_is_held(&bpf_event_mutex));
+ ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ if (ret < 0)
+ goto out;
+
+ /* set the new array to event->tp_event and set event->prog */
+ event->prog = prog;
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+
+out:
+ mutex_unlock(&bpf_event_mutex);
+ return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (!event->prog)
+ goto out;
+
+ old_array = rcu_dereference_protected(event->tp_event->prog_array,
+ lockdep_is_held(&bpf_event_mutex));
+
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret < 0) {
+ bpf_prog_array_delete_safe(old_array, event->prog);
+ } else {
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+ }
+
+ bpf_prog_put(event->prog);
+ event->prog = NULL;
+
+out:
+ mutex_unlock(&bpf_event_mutex);
+}
+
static struct bpf_prog_type_list perf_event_tl = {
.ops = &perf_event_prog_ops,
.type = BPF_PROG_TYPE_PERF_EVENT,
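
With attach and detach in place, several perf events opened on the same
tracepoint can each carry one BPF program, and all of them run on every
hit. A hedged userspace sketch (error handling elided; load_bpf_prog() is
a hypothetical helper returning a BPF program fd, e.g. via bpf(BPF_PROG_LOAD)):

    struct perf_event_attr attr = {
            .size = sizeof(attr),
            .type = PERF_TYPE_TRACEPOINT,
            .config = tracepoint_id,    /* from .../tracing/events/.../id */
            .sample_period = 1,
    };
    int ev1 = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
    int ev2 = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);

    ioctl(ev1, PERF_EVENT_IOC_SET_BPF, load_bpf_prog("filter1.o"));
    ioctl(ev2, PERF_EVENT_IOC_SET_BPF, load_bpf_prog("filter2.o"));
    /* a second SET_BPF on the same event fails: event->prog is set */

Note the two -EEXIST rules above: one program per perf event (the
event->prog check), and bpf_prog_array_copy() additionally refuses to add
the same program twice to one tp_event.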
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 975638e51be5..5db1a3eec84d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1129,13 +1129,12 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1165,13 +1164,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
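
For reference, the return-value convention these handlers rely on (zero
drops the sample, nonzero keeps it, per the trace_call_bpf() kernel-doc)
looks like this from the BPF side. A hypothetical kprobe filter in the
style of samples/bpf, assuming the usual SEC() and helper definitions:

    SEC("kprobe/sys_write")
    int filter_write(struct pt_regs *ctx)
    {
            /* keep the perf sample only for events from pid 1 */
            return (bpf_get_current_pid_tgid() >> 32) == 1;
    }
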
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d7cfb6a673e3..e8540540ecab 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -551,6 +551,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -566,7 +567,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -626,6 +628,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -641,7 +644,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
return;
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
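
The rewritten gate keeps the handler running when either consumer exists:
a BPF program may be attached while no perf reader sits on this CPU's
hlist, and the syscall record must still be built for it. Spelled out with
the names from this hunk:

    bool want_bpf  = valid_prog_array;      /* BPF programs attached? */
    bool want_perf = !hlist_empty(head);    /* perf readers on this CPU? */

    if (!want_bpf && !want_perf)
            return;                         /* no consumer at all */
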
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e77dfa1e7db7..d86cc7d768f1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1119,13 +1119,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
- struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));