Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r--	kernel/bpf/memalloc.c	| 64
1 file changed, 53 insertions(+), 11 deletions(-)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5f83be1d2018..ebcc3dd0fa19 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
 	for (i = 0; i < cnt; i++) {
-		obj = __alloc(c, node);
-		if (!obj)
-			break;
+		/*
+		 * free_by_rcu is only manipulated by irq work refill_work().
+		 * IRQ works on the same CPU are called sequentially, so it is
+		 * safe to use __llist_del_first() here. If alloc_bulk() is
+		 * invoked by the initial prefill, there will be no running
+		 * refill_work(), so __llist_del_first() is fine as well.
+		 *
+		 * In most cases, objects on free_by_rcu are from the same CPU.
+		 * If some objects come from other CPUs, it doesn't incur any
+		 * harm because NUMA_NO_NODE means the preference for current
+		 * numa node and it is not a guarantee.
+		 */
+		obj = __llist_del_first(&c->free_by_rcu);
+		if (!obj) {
+			obj = __alloc(c, node);
+			if (!obj)
+				break;
+		}
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			/* In RT irq_work runs in per-cpu kthread, so disable
 			 * interrupts to avoid preemption and interrupts and
@@ -222,9 +237,13 @@ static void __free_rcu(struct rcu_head *head)
 
 static void __free_rcu_tasks_trace(struct rcu_head *head)
 {
-	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
-
-	call_rcu(&c->rcu, __free_rcu);
+	/* If RCU Tasks Trace grace period implies RCU grace period,
+	 * there is no need to invoke call_rcu().
+	 */
+	if (rcu_trace_implies_rcu_gp())
+		__free_rcu(head);
+	else
+		call_rcu(head, __free_rcu);
 }
 
 static void enque_to_free(struct bpf_mem_cache *c, void *obj)
@@ -253,8 +272,9 @@ static void do_call_rcu(struct bpf_mem_cache *c)
 		 */
 		__llist_add(llnode, &c->waiting_for_gp);
 	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
-	 * Then use call_rcu() to wait for normal progs to finish
-	 * and finally do free_one() on each element.
+	 * If RCU Tasks Trace grace period implies RCU grace period, free
+	 * these elements directly, else use call_rcu() to wait for normal
+	 * progs to finish and finally do free_one() on each element.
 	 */
 	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
 }
@@ -418,14 +438,17 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	/* No progs are using this bpf_mem_cache, but htab_map_free() called
 	 * bpf_mem_cache_free() for all remaining elements and they can be in
 	 * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+	 *
+	 * Except for waiting_for_gp list, there are no concurrent operations
+	 * on these lists, so it is safe to use __llist_del_all().
 	 */
 	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
 		free_one(c, llnode);
 	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
 		free_one(c, llnode);
-	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
 		free_one(c, llnode);
-	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
 		free_one(c, llnode);
 }
 
@@ -441,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
	/* waiting_for_gp lists was drained, but __free_rcu might
 	 * still execute. Wait for it now before we freeing percpu caches.
+	 *
+	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+	 * so if call_rcu(head, __free_rcu) is skipped due to
+	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+	 * using rcu_trace_implies_rcu_gp() as well.
 	 */
 	rcu_barrier_tasks_trace();
-	rcu_barrier();
+	if (!rcu_trace_implies_rcu_gp())
+		rcu_barrier();
 	free_mem_alloc_no_barrier(ma);
 }
 
@@ -493,6 +524,16 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 		rcu_in_progress = 0;
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(ma->cache, cpu);
+			/*
+			 * refill_work may be unfinished for PREEMPT_RT kernel
+			 * in which irq work is invoked in a per-CPU RT thread.
+			 * It is also possible for kernel with
+			 * arch_irq_work_has_interrupt() being false and irq
+			 * work is invoked in timer interrupt. So waiting for
+			 * the completion of irq work to ease the handling of
+			 * concurrency.
+			 */
+			irq_work_sync(&c->refill_work);
 			drain_mem_cache(c);
 			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
@@ -507,6 +548,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			cc = per_cpu_ptr(ma->caches, cpu);
 			for (i = 0; i < NUM_CACHES; i++) {
 				c = &cc->cache[i];
+				irq_work_sync(&c->refill_work);
 				drain_mem_cache(c);
 				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
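
The alloc_bulk() hunk above reuses elements parked on free_by_rcu before falling back to a fresh allocation. As a rough standalone illustration of that reuse pattern (not part of the patch; the demo_* names are hypothetical, only the llist helpers and kmalloc() are real kernel APIs), a sketch might look like:

#include <linux/llist.h>
#include <linux/container_of.h>
#include <linux/slab.h>

struct demo_elem {
	struct llist_node node;
	/* payload ... */
};

/* Try to reuse an element parked on a lock-less list before falling back
 * to a fresh allocation.  The non-atomic __llist_del_first() is only safe
 * because, as in alloc_bulk(), a single context manipulates the list.
 */
static struct demo_elem *demo_alloc(struct llist_head *reuse)
{
	struct llist_node *n = __llist_del_first(reuse);

	if (n)
		return container_of(n, struct demo_elem, node);
	return kmalloc(sizeof(struct demo_elem), GFP_ATOMIC);
}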
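
The __free_rcu_tasks_trace() and do_call_rcu() changes hinge on one idea: elements that sleepable BPF programs may still reference must wait for both an RCU Tasks Trace grace period and a regular RCU grace period, and rcu_trace_implies_rcu_gp() lets the second stage be skipped when the first already covers it. A minimal standalone sketch of that two-stage pattern (the demo_* names are illustrative, not the patch's code):

#include <linux/rcupdate.h>
#include <linux/container_of.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu;
	/* payload ... */
};

/* Final stage: both grace periods have elapsed, memory can be reclaimed. */
static void demo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_obj, rcu));
}

/* Intermediate stage: the Tasks Trace grace period has elapsed.  Chain a
 * regular call_rcu() only when it is not already implied.
 */
static void demo_free_rcu_tasks_trace(struct rcu_head *head)
{
	if (rcu_trace_implies_rcu_gp())
		demo_free_rcu(head);
	else
		call_rcu(head, demo_free_rcu);
}

/* Entry point: wait for sleepable (rcu_read_lock_trace) readers first. */
static void demo_defer_free(struct demo_obj *obj)
{
	call_rcu_tasks_trace(&obj->rcu, demo_free_rcu_tasks_trace);
}

The free_mem_alloc() hunk applies the same reasoning to the barriers: rcu_barrier_tasks_trace() always runs to flush pending callbacks, while rcu_barrier() is only needed when a second call_rcu() stage could have been queued.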
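
Finally, the irq_work_sync() additions in bpf_mem_alloc_destroy() are about teardown ordering: on PREEMPT_RT the refill irq_work runs in a per-CPU kthread (or in a timer interrupt when arch_irq_work_has_interrupt() is false), so it may still be executing while the caches are drained. A sketch of that ordering, using a hypothetical demo_pool structure:

#include <linux/irq_work.h>
#include <linux/slab.h>

struct demo_pool {
	struct irq_work refill_work;
	/* freelists, counters, ... */
};

static void demo_pool_destroy(struct demo_pool *pool)
{
	/* Wait for a queued or running refill_work to finish before
	 * draining the lists it manipulates, mirroring the irq_work_sync()
	 * calls added above.
	 */
	irq_work_sync(&pool->refill_work);
	/* ... drain freelists here ... */
	kfree(pool);
}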