Diffstat (limited to 'kernel')
40 files changed, 378 insertions, 253 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..56cc5a915f67 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include <linux/bpf_verifier.h> #include <net/bpf_sk_storage.h> #include <linux/bpf_local_storage.h> +#include <linux/btf_ids.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..1fccba6e88c4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). 
+ */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); @@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, 
+ attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..e9e2df3f3f9e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -337,10 +337,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * already contains a warning when RCU is not watching, so no point * in having another one here. */ + lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); instrumentation_end(); return ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index da467e1dd49a..dc568ca295bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event, event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); - - if (group_event->attr.exclusive) - cpuctx->exclusive = 0; } #define DETACH_GROUP 0x01UL @@ -2583,11 +2580,8 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, cpuctx, ctx)) + goto error; /* * Schedule in siblings as one group (if any): @@ -2616,10 +2610,8 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); +error: pmu->cancel_txn(pmu); - - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } @@ -2645,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event, * If this group is exclusive and there are already * events on the CPU, it can't go on. */ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -3679,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data) *can_add_hw = 0; ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6374,14 +6367,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -7083,8 +7075,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ @@ -7186,6 +7177,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7190,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if (err) goto exit; @@ -7264,7 +7256,7 @@ perf_event_read_event(struct perf_event *event, int ret; 
perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7525,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7628,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7728,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7855,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +7981,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8291,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8325,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8380,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8455,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8498,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8588,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = 
perf_output_begin(&handle, data, event, bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8697,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8779,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -10085,6 +10078,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -10131,16 +10125,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) - goto fail_free_name; - - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; if (!filter->path.dentry || @@ -10160,13 +10151,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a441..228801e20788 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - int rctx; - - if (unlikely(in_nmi())) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; + unsigned int pc = preempt_count(); + unsigned char rctx = 0; + + rctx += !!(pc & (NMI_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initializes */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + 
perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ out: } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..6d266388d380 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2167,14 +2167,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2218,9 +2213,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index be68ac0d49ad..00259c7e288e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi_state *pi_state) */ if (pi_state->owner) { struct task_struct *owner; + unsigned long flags; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); owner = pi_state->owner; if (owner) { raw_spin_lock(&owner->pi_lock); @@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock(&owner->pi_lock); } rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } if (current->pi_state_cache) { @@ -1503,8 +1504,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (unlikely(should_fail_futex(true))) + if (unlikely(should_fail_futex(true))) { ret = -EFAULT; + goto out_unlock; + } ret = 
cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); if (!ret && (curval != uval)) { @@ -2378,10 +2381,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ce76f490126c..396ebaebea3f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -225,8 +225,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..164a031cfdb6 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -82,6 +82,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS # Generic IRQ IPI support config GENERIC_IRQ_IPI bool + select IRQ_DOMAIN_HIERARCHY # Generic MSI interrupt support config GENERIC_MSI_IRQ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8a12a25fa40d..41fdbb7953c6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1249,7 +1249,13 @@ __acquires(hlist_lock) *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. + */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_hash_lock); @@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash, __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. + */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_table_lock); @@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - raw_spin_lock_irqsave(&rp->lock, flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. 
+ */ + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); @@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..933a625621b8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -897,7 +897,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); - kthread_insert_work(worker, work, &worker->work_list); + if (!work->canceling) + kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3e99dfef8408..d9fb9e19d2ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,7 +84,7 @@ static inline bool lockdep_enabled(void) if (!debug_locks) return false; - if (raw_cpu_read(lockdep_recursion)) + if (this_cpu_read(lockdep_recursion)) return false; if (current->lockdep_recursion) @@ -2765,7 +2765,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2790,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -3593,15 +3595,12 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). 
*/ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) @@ -4057,7 +4056,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -4126,7 +4125,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -4396,6 +4395,9 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (unlikely(hlock_class(this)->usage_mask & new_mask)) goto unlock; + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); + hlock_class(this)->usage_mask |= new_mask; if (new_bit < LOCK_TRACE_STATES) { @@ -4403,19 +4405,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; } - switch (new_bit) { - case 0 ... LOCK_USED-1: + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; - break; - - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - - default: - break; } unlock: diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..164d79330849 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -530,7 +530,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[0]; + struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..45b054b7b5ec 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,7 +146,7 @@ int freeze_processes(void) BUG_ON(in_atomic()); /* - * Now that the whole userspace is frozen we need to disbale + * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..6b1525685277 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -345,7 +345,7 @@ DESC_ID((id) - DESCS_COUNT(desc_ring)) */ struct prb_data_block { unsigned long id; - char data[0]; + char data[]; }; /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..bd04b09b84b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -409,7 +409,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_dyntick_idle(void) { int special; @@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu) smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. 
@@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu) rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..af6f23d8bea1 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,22 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; - } + case 'g': *mode = REBOOT_GPIO; break; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e254745a82cb..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,8 +102,12 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) - return false; + if (!sg_policy->need_freq_update) { + if (sg_policy->next_freq == next_freq) + return false; + } else { + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -164,7 +168,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -440,7 +443,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long util, max; unsigned int next_f; - bool busy; unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); @@ -451,9 +453,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); - util = sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_iowait_apply(sg_cpu, time, util, max); @@ -462,7 +461,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. 
*/ - if (busy && next_f < sg_policy->next_freq) { + if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -827,9 +826,10 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->limits_changed = false; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..2357921580f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, unsigned long flags = *(unsigned long *)table->data; size_t data_size = 0; size_t len = 0; - char *tmp; + char *tmp, *buf; int idx; if (write) @@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, return 0; } - tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); - if (!tmp) + buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); + if (!buf) return -ENOMEM; for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { char *name = sd_flag_debug[idx].name; - len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + len += snprintf(buf + len, strlen(name) + 2, "%s ", name); } - tmp += *ppos; + tmp = buf + *ppos; len -= *ppos; if (len > *lenp) @@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, *lenp = len; *ppos += len; - kfree(tmp); + kfree(buf); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..8917d2d715ef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. 
+ * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. */ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6256,8 @@ symmetric: recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,6 +6266,26 @@ symmetric: return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -9031,7 +9050,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. 
This might end up creating spare capacity diff --git a/kernel/signal.c b/kernel/signal.c index a38b3edc6851..ef8f2a28d37c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -391,16 +391,17 @@ static bool task_participate_group_stop(struct task_struct *task) void task_join_group_stop(struct task_struct *task) { + unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; + struct signal_struct *sig = current->signal; + + if (sig->group_stop_count) { + sig->group_stop_count++; + mask |= JOBCTL_STOP_CONSUME; + } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) + return; + /* Have the new thread join an on-going signal group stop */ - unsigned long jobctl = current->jobctl; - if (jobctl & JOBCTL_STOP_PENDING) { - struct signal_struct *sig = current->signal; - unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; - unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - if (task_set_jobctl_pending(task, signr | gstop)) { - sig->group_stop_count++; - } - } + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..890b79cf0e7c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -178,7 +178,7 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } -void __weak stop_machine_yield(const struct cpumask *cpumask) +notrace void __weak stop_machine_yield(const struct cpumask *cpumask) { cpu_relax(); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..387b4bef7dd1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,11 +425,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ca4e6d57d68b..00629e658ca1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -172,10 +172,6 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - /* - * Use the to_ktime conversion because that clamps the maximum - * value to KTIME_MAX and avoid multiplication overflows. 
- */ nval = timespec64_to_ns(&value->it_value); ninterval = timespec64_to_ns(&value->it_interval); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0642013dace4..b1b9b12899f5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -68,13 +68,13 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -int sched_clock_read_retry(unsigned int seq) +notrace int sched_clock_read_retry(unsigned int seq) { return read_seqcount_latch_retry(&cd.seq, seq); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index de37e33a868d..c3ad64fb9d8b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -732,11 +732,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer) debug_object_deactivate(timer, &timer_debug_descr); } -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..dc83b3fa9fe7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum { }; /* * Used for which event context the event is in. - * NMI = 0 - * IRQ = 1 - * SOFTIRQ = 2 - * NORMAL = 3 + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { + RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * bit 1 = NMI context + * bit 2 = IRQ context + * bit 3 = SoftIRQ context + * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. 
That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSTION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) - return 1; + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt context, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit. + */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) + return 1; + } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->current_context - (1 << cpu_buffer->nest); } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 528971714fc6..410cfeb16db5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer - * to store the trace event for the tigger to use. It's recusive + * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { @@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ - if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* @@ -3132,7 +3132,7 @@ static char *get_trace_buf(void) /* Interrupts must see nesting incremented before we use the buffer */ barrier(); - return &buffer->buffer[buffer->nesting][0]; + return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3f5e77123ad..1dadef445cd1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -637,6 +637,12 @@ enum { * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. 
+ */ + TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) return 0; bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) - return -1; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit; + return bit + 1; } static __always_inline void trace_clear_recursion(int bit) @@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit) if (!bit) return; + bit--; bit = 1 << bit; val &= ~bit; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 3212e2c653b3..881df991742a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -584,7 +584,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, { struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; + int len, ret = -ENOMEM; + struct seq_buf s; ssize_t size; if (field_type[0] == ';') @@ -616,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len--; field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; + if (!field->name) goto free; - } + if (!is_good_name(field->name)) { synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); ret = -EINVAL; @@ -630,29 +630,29 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field_type++; len = strlen(field_type) + 1; - if (array) { - int l = strlen(array); + if (array) + len += strlen(array); - if (l && array[l - 1] == ';') - l--; - len += l; - } if (prefix) len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; + if (!field->type) goto free; - } + + seq_buf_init(&s, field->type, len); if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); + seq_buf_puts(&s, prefix); + seq_buf_puts(&s, field_type); if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; + seq_buf_puts(&s, array); + if (s.buffer[s.len - 1] == ';') + s.len--; } + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + + s.buffer[s.len] = '\0'; size = synth_field_size(field->type); if (size < 0) { @@ -663,14 +663,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, if (synth_field_is_string(field->type)) { char *type; - type = kzalloc(sizeof("__data_loc ") + strlen(field->type) + 1, GFP_KERNEL); - if (!type) { - ret = -ENOMEM; + len = sizeof("__data_loc ") + strlen(field->type) + 1; + type = kzalloc(len, GFP_KERNEL); + if (!type) goto free; - } - strcat(type, "__data_loc "); - strcat(type, field->type); + seq_buf_init(&s, type, len); + seq_buf_puts(&s, "__data_loc "); + seq_buf_puts(&s, field->type); + + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; + kfree(field->type); field->type = type; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 
b5e3496cf803..4738ad48a667 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -492,8 +492,13 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_rec_probe); ret = -1; - if (trace_selftest_recursion_cnt != 1) { - pr_cont("*callback not called once (%d)* ", + /* + * Recursion allows for transitions between context, + * and may call the callback twice. + */ + if (trace_selftest_recursion_cnt != 1 && + trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called once (or twice) (%d)* ", trace_selftest_recursion_cnt); goto out; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 26efd22f0633..3f659f855074 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -50,7 +50,7 @@ static bool ok_to_free_tracepoints; */ struct tp_probes { struct rcu_head rcu; - struct tracepoint_func probes[0]; + struct tracepoint_func probes[]; }; static inline void *allocate_probes(int count) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..71109065bd8e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; - struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +static struct cpumask watchdog_allowed_mask __read_mostly; + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |
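
The kernel/events/internal.h hunk above replaces the in_nmi()/in_irq()/in_softirq() chain in get_recursion_context() with a branchless computation on preempt_count(). Below is a minimal standalone sketch of that index calculation, assuming the conventional preempt_count bit layout (softirq count at bit 8, hardirq at bit 16, NMI at bit 20); the mask constants are illustrative stand-ins for the <linux/preempt.h> definitions and are not part of the patch.

/* Standalone sketch: branchless recursion-context index from a preempt_count value. */
#include <stdio.h>

#define SOFTIRQ_OFFSET 0x00000100u   /* assumed layout: softirq count at bit 8  */
#define HARDIRQ_MASK   0x000f0000u   /* assumed layout: hardirq count at bit 16 */
#define NMI_MASK       0x00f00000u   /* assumed layout: NMI count at bit 20     */

static int recursion_context(unsigned int pc)
{
	unsigned char rctx = 0;

	/* Each test folds the "outer" contexts in, so NMI=3, hardirq=2, softirq=1, task=0 */
	rctx += !!(pc & (NMI_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

	return rctx;
}

int main(void)
{
	/* Fake preempt_count() snapshots for the four contexts */
	printf("task    -> %d\n", recursion_context(0x00000000));
	printf("softirq -> %d\n", recursion_context(SOFTIRQ_OFFSET));
	printf("hardirq -> %d\n", recursion_context(0x00010000));
	printf("nmi     -> %d\n", recursion_context(0x00100000));
	return 0;
}

The branchless form avoids conditional jumps on a hot path while preserving the original mapping (NMI 3, hardirq 2, softirq 1, task 0) used to index the per-context recursion counters.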