From 245282557c49754af3dbcc732316e814340d6bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Jan 2012 11:58:43 +0800 Subject: cgroup: move struct cgroup_pidlist out from the header file It's internally used only. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..6ca7acad7c55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3043,6 +3043,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. -- cgit From b78949ebfb563c29808a9d0a772e3adb5561bc80 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:30 -0800 Subject: cgroup: simplify double-check locking in cgroup_attach_proc To keep the complexity of the double-check locking in one place, move the thread_group_leader check up into attach_task_by_pid(). This allows us to use a goto instead of returning -EAGAIN. While at it, convert a couple of returns to gotos and use rcu for the !pid case also in order to simplify the logic. Changes in V2: * https://lkml.org/lkml/2011/12/22/86 (Tejun Heo) * Use a goto instead of returning -EAGAIN Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 79 +++++++++++++++++++++------------------------------------ 1 file changed, 29 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ca7acad7c55..12c07e8fd69c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2104,19 +2104,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* prevent changes to the threadgroup list while we take a snapshot. */ read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". 
- */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; do { @@ -2245,22 +2232,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2250,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2293,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** -- cgit From fb5d2b4cfc24963d0e8a7df57de1ecffa10a04cf Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:31 -0800 Subject: cgroup: replace tasklist_lock with rcu_read_lock We can replace the tasklist_lock in cgroup_attach_proc with an rcu_read_lock(). 
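In outline, the patch turns the thread-group snapshot from a tasklist_lock reader into an RCU read-side critical section. A condensed sketch of the resulting loop, pieced together from the hunks in this and the later attach-path patches of this series (not a verbatim quote of the final code):

	rcu_read_lock();
	tsk = leader;
	i = 0;
	do {
		struct task_and_cgroup ent;

		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			continue;
		/* GFP_ATOMIC is moot here: the flex array was preallocated */
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();

Unlike tasklist_lock, RCU does not freeze the thread list; it only guarantees that a task_struct already on its way out is not freed while while_each_thread() is walking it, which is all the snapshot needs.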
Changes in V4:
* https://lkml.org/lkml/2011/12/23/284 (Frederic Weisbecker)
* Minimize size of rcu_read_lock critical section
* Add comment
* https://lkml.org/lkml/2011/12/26/136 (Li Zefan)
* Split into two patches
Changes in V3:
* https://lkml.org/lkml/2011/12/22/419 (Frederic Weisbecker)
* Add an rcu_read_lock to protect against exit
Changes in V2:
* https://lkml.org/lkml/2011/12/22/86 (Tejun Heo)
* Use a goto instead of returning -EAGAIN

Suggested-by: Frederic Weisbecker
Signed-off-by: Mandeep Singh Baines
Acked-by: Li Zefan
Acked-by: Frederic Weisbecker
Signed-off-by: Tejun Heo
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: KAMEZAWA Hiroyuki
Cc: Oleg Nesterov
Cc: Andrew Morton
Cc: Paul Menage
---
kernel/cgroup.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 12c07e8fd69c..1626152dcc1e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2102,10 +2102,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (retval)
goto out_free_group_list;
- /* prevent changes to the threadgroup list while we take a snapshot. */
- read_lock(&tasklist_lock);
tsk = leader;
i = 0;
+ /*
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
+ */
+ rcu_read_lock();
do {
struct task_and_cgroup ent;
@@ -2128,11 +2132,11 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
+ rcu_read_unlock();
/* remember the number of threads in the array for later. */
group_size = i;
tset.tc_array = group;
tset.tc_array_len = group_size;
- read_unlock(&tasklist_lock);
/* methods shouldn't be called if no task is actually migrating */
retval = 0;
-- cgit
From 0ce8974d504913a0f0ae2d97b20a5ac665431a41 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 6 Jan 2012 03:13:27 -0800
Subject: sysctl: Consolidate !CONFIG_SYSCTL handling

- In sysctl.h move functions only available if CONFIG_SYSCTL is defined
  inside of #ifdef CONFIG_SYSCTL
- Move the stub function definitions for !CONFIG_SYSCTL into sysctl.h
  and make them static inlines.

Signed-off-by: Eric W. Biederman
---
kernel/sysctl.c | 26 --------------------------
1 file changed, 26 deletions(-)
(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..d5bbddd0de24 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2017,32 +2017,6 @@ void setup_sysctl_set(struct ctl_table_set *p,
p->is_seen = is_seen;
}
-#else /* !CONFIG_SYSCTL */
-struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-{
- return NULL;
-}
-
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
- struct ctl_table_set *parent,
- int (*is_seen)(struct ctl_table_set *))
-{
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-}
-
#endif /* CONFIG_SYSCTL */
/*
-- cgit
From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 6 Jan 2012 03:34:20 -0800
Subject: sysctl: Register the base sysctl table like any other sysctl table.

Simplify the code by treating the base sysctl table like any other
sysctl table and register it with register_sysctl_table.
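The payoff is visible in the diff below: with the base table made ordinary (and root_table reduced to the one-element placeholder that other tables attach to), early registration collapses to a single call:

	int __init sysctl_init(void)
	{
		register_sysctl_table(sysctl_base_table);
		return 0;
	}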
To ensure this table is registered early enough to avoid problems call sysctl_init from proc_sys_init. Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid name conflicts now that kernel/sysctl.c:sysctl_init() is no longer static. Signed-off-by: Eric W. Biederman --- kernel/sysctl.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5bbddd0de24..ad460248acc7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,7 +192,7 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[]; +static struct ctl_table root_table[1]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, @@ -222,7 +222,7 @@ int sysctl_legacy_va_layout; /* The default sysctl tables: */ -static struct ctl_table root_table[] = { +static struct ctl_table sysctl_base_table[] = { { .procname = "kernel", .mode = 0555, @@ -1747,17 +1747,12 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) } } -static __init int sysctl_init(void) +int __init sysctl_init(void) { - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif + register_sysctl_table(sysctl_base_table); return 0; } -core_initcall(sysctl_init); - static struct ctl_table *is_branch_in(struct ctl_table *branch, struct ctl_table *table) { -- cgit From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 04:07:15 -0800 Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c into fs/proc/proc_sysctl.c. Currently sysctl maintenance is hampered by the sysctl implementation being split across 3 files with artificial layering between them. Consolidate the entire sysctl implementation into 1 file so that it is easier to see what is going on and hopefully allowing for simpler maintenance. For functions that are now only used in fs/proc/proc_sysctl.c remove their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c Signed-off-by: Eric W. 
Biederman --- kernel/Makefile | 1 - kernel/sysctl.c | 464 -------------------------------------------------- kernel/sysctl_check.c | 160 ----------------- 3 files changed, 625 deletions(-) delete mode 100644 kernel/sysctl_check.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad460248acc7..b774909ed46c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,20 +192,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[1]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -1559,459 +1545,12 @@ static struct ctl_table dev_table[] = { { } }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. 
- */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. 
- */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - int __init sysctl_init(void) { register_sysctl_table(sysctl_base_table); return 0; } -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. - * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. 
- * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. 
A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - #endif /* CONFIG_SYSCTL */ /* @@ -2977,6 +2516,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char 
**fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -- cgit From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:38:29 +0100 Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices The current device suspend/resume phases during system-wide power transitions appear to be insufficient for some platforms that want to use the same callback routines for saving device states and related operations during runtime suspend/resume as well as during system suspend/resume. In principle, they could point their .suspend_noirq() and .resume_noirq() to the same callback routines as their .runtime_suspend() and .runtime_resume(), respectively, but at least some of them require device interrupts to be enabled while the code in those routines is running. It also makes sense to have device suspend-resume callbacks that will be executed with runtime PM disabled and with device interrupts enabled in case someone needs to run some special code in that context during system-wide power transitions. Apart from this, .suspend_noirq() and .resume_noirq() were introduced as a workaround for drivers using shared interrupts and failing to prevent their interrupt handlers from accessing suspended hardware. 
It appears to be better not to use them for other purposes, or we may have to deal with some serious confusion (which seems to be happening already).

For the above reasons, introduce new device suspend/resume phases, "late suspend" and "early resume" (and analogously for hibernation) whose callbacks will be executed with runtime PM disabled and with device interrupts enabled and whose callback pointers generally may point to runtime suspend/resume routines.

Signed-off-by: Rafael J. Wysocki
Reviewed-by: Mark Brown
Reviewed-by: Kevin Hilman
---
kernel/kexec.c | 8 ++++----
kernel/power/hibernate.c | 24 ++++++++++++------------
kernel/power/main.c | 8 ++++++--
kernel/power/suspend.c | 4 ++--
4 files changed, 24 insertions(+), 20 deletions(-)
(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..a6a675cb9818 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1546,13 +1546,13 @@ int kernel_kexec(void)
if (error)
goto Resume_console;
/* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_noirq(). We *must* call
- * dpm_suspend_noirq() now. Otherwise, drivers for
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
* some devices (e.g. interrupt controllers) become
* desynchronized with the actual state of the
* hardware at resume time, and evil weirdness ensues.
*/
- error = dpm_suspend_noirq(PMSG_FREEZE);
+ error = dpm_suspend_end(PMSG_FREEZE);
if (error)
goto Resume_devices;
error = disable_nonboot_cpus();
@@ -1579,7 +1579,7 @@ int kernel_kexec(void)
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
- dpm_resume_noirq(PMSG_RESTORE);
+ dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
Resume_console:
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..a5d4cf0aa03e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
* create_image - Create a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
*
- * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
- * and execute the drivers' .thaw_noirq() callbacks.
+ * Execute device drivers' "late" and "noirq" freeze callbacks, create a
+ * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
*
* Control reappears in this routine after the subsequent restore.
*/
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
{
int error;
- error = dpm_suspend_noirq(PMSG_FREEZE);
+ error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
Platform_finish:
platform_finish(platform_mode);
- dpm_resume_noirq(in_suspend ?
+ dpm_resume_start(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
return error;
@@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode)
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
*
- * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
- * highmem that have not been restored yet from the image and run the low-level
- * code that will restore the remaining contents of memory and switch to the
- * just restored target kernel.
+ * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +518,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +549,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..8c5014a4e052 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..560a639614a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) -- cgit From d031e1de2c5ba91e67ed83f6adf624543ab2b03d Mon Sep 17 00:00:00 2001 From: Alex Frid Date: Sun, 29 Jan 2012 20:39:25 +0100 Subject: PM / QoS: Simplify PM QoS expansion/merge - Replace class ID #define with enumeration - Loop through PM QoS objects during initialization (rather than initializing them one-by-one) Signed-off-by: Alex Frid Reviewed-by: Antti Miettinen Reviewed-by: Diwakar Tundlam Reviewed-by: Scott Williams Reviewed-by: Yu-Huan Hsu Acked-by: markgross Signed-off-by: Rafael J. 
Wysocki
---
kernel/power/qos.c | 23 ++++++++++-------------
1 file changed, 10 insertions(+), 13 deletions(-)
(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
static int __init pm_qos_power_init(void)
{
int ret = 0;
+ int i;
- ret = register_pm_qos_misc(&cpu_dma_pm_qos);
- if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
- return ret;
- }
- ret = register_pm_qos_misc(&network_lat_pm_qos);
- if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
- return ret;
+ BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+
+ for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
+ ret = register_pm_qos_misc(pm_qos_array[i]);
+ if (ret < 0) {
+ printk(KERN_ERR "pm_qos_param: %s setup failed\n",
+ pm_qos_array[i]->name);
+ return ret;
+ }
}
- ret = register_pm_qos_misc(&network_throughput_pm_qos);
- if (ret < 0)
- printk(KERN_ERR
- "pm_qos_param: network_throughput setup failed\n");
return ret;
}
-- cgit
From 61d1d219c4c0761059236a46867bc49943c4d29d Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines
Date: Mon, 30 Jan 2012 12:51:56 -0800
Subject: cgroup: remove extra calls to find_existing_css_set

In cgroup_attach_proc, we indirectly call find_existing_css_set 3 times. It is an expensive call so we want to call it as few times as possible. This patch only calls it once and stores the result so that it can be used later on when we call cgroup_task_migrate.

This required modifying cgroup_task_migrate to take the new css_set (which we obtained from find_css_set) as a parameter. The nice side effect of this is that cgroup_task_migrate is now identical for cgroup_attach_task and cgroup_attach_proc. It also now returns void since it can never fail.

Changes in V5:
* https://lkml.org/lkml/2012/1/20/344 (Tejun Heo)
* Remove css_set_refs
Changes in V4:
* https://lkml.org/lkml/2011/12/22/421 (Li Zefan)
* Avoid GFP_KERNEL (sleep) in rcu_read_lock by getting css_set in a separate loop not under an rcu_read_lock
Changes in V3:
* https://lkml.org/lkml/2011/12/22/13 (Li Zefan)
* Fixed earlier bug by creating a separate patch to remove tasklist_lock
Changes in V2:
* https://lkml.org/lkml/2011/12/20/372 (Tejun Heo)
* Move find_css_set call into loop which creates the flex array
* Author
* Kill css_set_refs and use group_size instead
* Fix an off-by-one error in counting css_set refs
* Add a retval check in out_list_teardown

Signed-off-by: Mandeep Singh Baines
Acked-by: Li Zefan
Signed-off-by: Tejun Heo
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: KAMEZAWA Hiroyuki
Cc: Frederic Weisbecker
Cc: Oleg Nesterov
Cc: Andrew Morton
Cc: Paul Menage
---
kernel/cgroup.c | 140 +++++++++++---------------------------------------------
1 file changed, 27 insertions(+), 113 deletions(-)
(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1626152dcc1e..43a224f167b5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1763,6 +1763,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
struct task_and_cgroup {
struct task_struct *task;
struct cgroup *cgrp;
+ struct css_set *cg;
};
struct cgroup_taskset {
@@ -1843,11 +1844,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
* will already exist. If not set, this function might sleep, and can fail with
* -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
*/ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1857,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. */ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1875,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1910,6 +1892,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1939,9 +1922,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) @@ -1997,66 +1984,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. 
- */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +1997,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2119,15 +2038,15 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; @@ -2160,17 +2079,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2181,8 +2095,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2200,15 +2113,16 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. 
*/
- list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
- list_del(&cg_entry->links);
- put_css_set(cg_entry->cg);
- kfree(cg_entry);
+out_put_css_set_refs:
+ if (retval) {
+ for (i = 0; i < group_size; i++) {
+ tc = flex_array_get(group, i);
+ if (!tc->cg)
+ break;
+ put_css_set(tc->cg);
+ }
}
out_cancel_attach:
- /* same deal as in cgroup_attach_task */
if (retval) {
for_each_subsys(root, ss) {
if (ss == failed_ss)
-- cgit
From 761b3ef50e1c2649cffbfa67a4dcb2dcdb7982ed Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 31 Jan 2012 13:47:36 +0800
Subject: cgroup: remove cgroup_subsys argument from callbacks

The argument is not used at all, and it's not necessary, because a specific callback handler of course knows which subsys it belongs to.

Now only ->populate() takes this argument, because the handlers of this callback always call cgroup_add_file()/cgroup_add_files().

So we reduce a few lines of code, though the shrinking of object size is minimal.

16 files changed, 113 insertions(+), 162 deletions(-)

   text    data     bss     dec     hex filename
5486240  656987 7039960 13183187  c928d3 vmlinux.o.orig
5486170  656987 7039960 13183117  c9288d vmlinux.o

Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
kernel/cgroup.c | 43 +++++++++++++++++++++----------------------
kernel/cgroup_freezer.c | 11 ++++-------
kernel/cpuset.c | 16 +++++-----------
kernel/events/core.c | 13 +++++--------
kernel/sched/core.c | 20 ++++++++------------
5 files changed, 43 insertions(+), 60 deletions(-)
(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 43a224f167b5..865d89a580c7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy) {
- ret = ss->pre_destroy(ss, cgrp);
+ ret = ss->pre_destroy(cgrp);
if (ret)
break;
}
@@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* Release the subsystem state objects.
*/ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1908,7 +1908,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1932,7 +1932,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1954,7 +1954,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -2067,7 +2067,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2104,7 +2104,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2128,7 +2128,7 @@ out_cancel_attach: if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -3756,7 +3756,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3770,7 +3770,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3804,7 +3804,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4028,7 +4028,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4117,7 +4117,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). 
*/ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4135,7 +4135,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4233,7 +4233,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4509,7 +4509,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4611,7 +4611,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -5066,8 +5066,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5077,7 +5076,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. 
*/ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..5d575836dba6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1399,8 +1399,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1435,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1831,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1854,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1896,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). 
*/ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a00..a5d1ee92b0d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6906,8 +6906,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -6924,8 +6923,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6941,8 +6939,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -6950,8 +6947,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263e..ff12f7216062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,8 +7530,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7548,15 +7547,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7574,7 +7572,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7584,8 +7582,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. 
@@ -7935,8 +7933,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7966,8 +7963,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); -- cgit From a80b83b7b8456e9b475346c2e01d7e210883208c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Feb 2012 00:19:07 -0800 Subject: Input: add infrastructure for selecting clockid for event time stamps As noted by Arve and others, since wall time can jump backwards, it is difficult to use for input because one cannot determine if one event occurred before another or for how long a key was pressed. However, the timestamp field is part of the kernel ABI, and cannot be changed without possibly breaking existing users. This patch adds a new IOCTL that allows a clockid to be set in the evdev_client struct that will specify which time base to use for event timestamps (ie: CLOCK_MONOTONIC instead of CLOCK_REALTIME). For now we only support CLOCK_MONOTONIC and CLOCK_REALTIME, but in the future we could support other clockids if appropriate. The default remains CLOCK_REALTIME, so we don't change the ABI. Signed-off-by: John Stultz Reviewed-by: Daniel Kurtz Signed-off-by: Dmitry Torokhov --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..169479994755 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1140,6 +1140,8 @@ ktime_t ktime_get_monotonic_offset(void) } while (read_seqretry(&xtime_lock, seq)); return timespec_to_ktime(wtom); } +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + /** * xtime_update() - advances the timekeeping infrastructure -- cgit From 241057486646dd42278538218376c79aae2c359f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 3 Feb 2012 21:42:39 +0800 Subject: kernel/resource.c: move EXPORT_SYMBOL right after definition EXPORT_SYMBOL(adjust_resource) should be right after adjust_resource(). Signed-off-by: WANG Cong Signed-off-by: Jiri Kosina --- kernel/resource.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_unlock(&resource_lock); return result; } +EXPORT_SYMBOL(adjust_resource); static void __init __reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, @@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root, write_unlock(&resource_lock); } -EXPORT_SYMBOL(adjust_resource); - /** * resource_alignment - calculate resource's alignment * @res: resource pointer -- cgit From 1a2a4d06e1e95260c470ebe3a945f61bbe8c1fd8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 Dec 2011 12:17:03 -0800 Subject: security: create task_free security callback The current LSM interface to cred_free is not sufficient for allowing an LSM to track the life and death of a task. This patch adds the task_free hook so that an LSM can clean up resources on task death. 
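For illustration only (the task_free hook and its call site are from this patch, but every other name below is hypothetical): an LSM would wire the new callback up through its security_operations and use it to drop per-task bookkeeping.

static void example_task_free(struct task_struct *task)
{
	/* Drop whatever state this LSM tracked for the dying task;
	 * with only cred_free available there was no reliable point
	 * to do this. example_forget_task() is a hypothetical helper. */
	example_forget_task(task);
}

static struct security_operations example_security_ops = {
	.name		= "example",
	.task_free	= example_task_free,	/* hook added by this patch */
};

The callback runs from __put_task_struct(), i.e. once the last reference to the task is dropped, which is exactly the lifetime boundary cred_free could not provide.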
Signed-off-by: Kees Cook Signed-off-by: James Morris --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1b2ef3c23ae4..f0e7781ba9b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); -- cgit From 8916e3702ec422b57cc549fbae3986106292100f Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sat, 4 Feb 2012 22:26:13 +0100 Subject: PM / Suspend: Avoid code duplication in suspend statistics update The code if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); } else suspend_stats.success++; appears in both kernel/power/main.c and kernel/power/suspend.c. This patch just creates a new function to avoid the duplicated code. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Acked-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 6 +----- kernel/power/suspend.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 8c5014a4e052..b1e324878d5f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,11 +296,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } if (state < PM_SUSPEND_MAX && *s) { error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; + suspend_stats_update(error); } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 560a639614a1..03bc92b42750 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -321,11 +321,7 @@ int pm_suspend(suspend_state_t state) int ret; if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; + suspend_stats_update(ret); return ret; } return -EINVAL; -- cgit From 51d6ff7acd920379f54d0be4dbe844a46178a65f Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 4 Feb 2012 22:26:38 +0100 Subject: PM / Hibernate: Thaw kernel threads in hibernation_snapshot() in error/test path In the hibernation call path, the kernel threads are frozen inside hibernation_snapshot(). If we happen to encounter an error further down the road or if we are exiting early due to a successful freezer test, then thaw kernel threads before returning to the caller. Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- kernel/power/user.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5d4cf0aa03e..c6dee739080c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) * successful freezer test.
*/ freezer_test_done = true; - goto Cleanup; + goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); - goto Cleanup; + goto Thaw; } suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Thaw: + thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..7bee91f9af51 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,16 +249,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (error) { - thaw_kernel_threads(); - } else { + if (!error) { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; - if (freezer_test_done) { + if (freezer_test_done) freezer_test_done = false; - thaw_kernel_threads(); - } } break; -- cgit From a556d5b58345ccf51826b9ceac078072f830738b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 4 Feb 2012 23:39:56 +0100 Subject: PM / Hibernate: Refactor and simplify freezer_test_done The code related to 'freezer_test_done' is needlessly convoluted. Refactor the code and simplify the implementation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 10 +++++----- kernel/power/user.c | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c6dee739080c..72baaf011fb7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -629,12 +629,8 @@ int hibernate(void) goto Finish; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; + if (error || freezer_test_done) goto Thaw; - } if (in_suspend) { unsigned int flags = 0; @@ -659,6 +655,10 @@ int hibernate(void) Thaw: thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Finish: free_basic_memory_bitmaps(); usermodehelper_enable(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7bee91f9af51..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -251,10 +251,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) - freezer_test_done = false; + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } break; -- cgit From a9b542ee607a8afafa9447292394959fc84ea650 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Mon, 13 Feb 2012 16:23:42 +0100 Subject: PM / QoS: unconditionally build the feature The PM QoS feature originally didn't depend on CONFIG_PM, which was mistakenly changed by commit e8db0be1245de16a6cc6365506abc392c3c212d4 PM QoS: Move and rename the implementation files Later, commit d020283dc694c9ec31b410f522252f7a8397e67d PM / QoS: CPU C-state breakage with PM Qos change partially fixed that by introducing a static inline definition of pm_qos_request(), but that still didn't allow user space to use the PM QoS interface if CONFIG_PM was unset (which had been possible before). For this reason, remove the dependency of PM QoS on CONFIG_PM to make it work (as intended) with CONFIG_PM unset. 
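To make the user-space angle concrete, here is a minimal sketch of the documented PM QoS character-device usage that this change keeps working when CONFIG_PM is unset (interface per the PM QoS ABI documentation; error handling trimmed; nothing here is taken from the patch itself):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t max_latency_us = 20;	/* request <= 20 us CPU wakeup latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, &max_latency_us, sizeof(max_latency_us));
	pause();	/* the request is held until the fd is closed */
	return 0;
}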
[rjw: Replaced the original changelog with a new one.] Signed-off-by: Jean Pihet Reported-by: Venkatesh Pallipadi Signed-off-by: Rafael J. Wysocki --- kernel/power/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o qos.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o -- cgit From 6c83b4818dd65eb17e633b6b629a81da7bed90b3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Feb 2012 00:00:34 +0100 Subject: PM / Sleep: Do not check wakeup too often in try_to_freeze_tasks() Use the observation that it is more efficient to check the wakeup variable once before the loop reporting tasks that were not frozen in try_to_freeze_tasks() than to do that in every step of that loop. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..6aeb5efe00eb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -98,13 +98,15 @@ static int try_to_freeze_tasks(bool user_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, elapsed_csecs % 100); -- cgit From 6f585f750d792652f33b6e85b1ee205be4b5e572 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Feb 2012 22:40:23 +0100 Subject: PM / Sleep: Remove unnecessary label from suspend_freeze_processes() The Finish label in suspend_freeze_processes() is in fact unnecessary and makes the function look more complicated than it really is, so remove that label (along with a few empty lines). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/power.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..398d42b48e9e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -234,16 +234,14 @@ static inline int suspend_freeze_processes(void) int error; error = freeze_processes(); - /* * freeze_processes() automatically thaws every task if freezing * fails. So we need not do anything extra upon error. */ if (error) - goto Finish; + return error; error = freeze_kernel_threads(); - /* * freeze_kernel_threads() thaws only kernel threads upon freezing * failure. So we have to thaw the userspace tasks ourselves. 
@@ -251,7 +249,6 @@ static inline int suspend_freeze_processes(void) if (error) thaw_processes(); - Finish: return error; } -- cgit From 191c542442fdf53cc3c496c00be13367fd9cd42d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: mm: collapse security_vm_enough_memory() variants into a single function Collapse security_vm_enough_memory() variants into a single function. Signed-off-by: Al Viro Signed-off-by: James Morris --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f0e7781ba9b4..d5ebddf317a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -355,7 +355,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } -- cgit From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: security: trim security.h Trim security.h Signed-off-by: Al Viro Signed-off-by: James Morris --- kernel/cred.c | 1 + kernel/exit.c | 1 + kernel/sched/core.c | 1 + kernel/sysctl.c | 1 + 4 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #if 0 diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..5ad867a3685e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..78682bfb3405 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..11d53046b905 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include -- cgit From e1964c50a83d1ce53731c88271d12ac92292a880 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Be less verbose irq_domain printk's too much. Drop some output. 
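Worth noting (a general kernel facility, not something this patch touches): the message demoted to pr_debug() remains reachable at runtime on kernels built with CONFIG_DYNAMIC_DEBUG, e.g. by writing "file irqdomain.c +p" to /sys/kernel/debug/dynamic_debug/control; on other builds it compiles out unless DEBUG is defined.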
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..cc2cd43ec740 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -170,13 +170,11 @@ void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", + pr_debug("looking for phys_base=%llx, irq_start=%i\n", (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ -- cgit From 7bb69bade0d41715bdf1b24f5ef0b8f798769fe9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Make irq_domain structure match powerpc's irq_host Part of the series to unify the irq remapping mechanisms in the kernel. A follow-up patch will copy the powerpc implementation into kernel/irq/irqdomain.c, which will be a lot easier if the structures are identical. Where they differ, I've chosen to use the powerpc names since there is a lot more code using those names. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cc2cd43ec740..509adb8762d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -43,7 +43,7 @@ void irq_domain_add(struct irq_domain *domain) } mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); } @@ -57,7 +57,7 @@ void irq_domain_del(struct irq_domain *domain) int hwirq, irq; mutex_lock(&irq_domain_mutex); - list_del(&domain->list); + list_del(&domain->link); mutex_unlock(&irq_domain_mutex); /* Clear the irq_domain assignments */ @@ -88,10 +88,10 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Find a domain which can translate the irq spec */ mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) + list_for_each_entry(domain, &irq_domain_list, link) { + if (!domain->ops->xlate) continue; - rc = domain->ops->dt_translate(domain, controller, + rc = domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type); if (rc == 0) break; @@ -126,7 +126,7 @@ void irq_dispose_mapping(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, +int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(irq_domain_generate_simple); struct irq_domain_ops irq_domain_simple_ops = { #ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, + .xlate = irq_domain_simple_xlate, #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit From cc79ca691c292e9fd44f589c7940b9654e22f2f6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 16
Feb 2012 01:37:49 -0700 Subject: irq_domain: Move irq_domain code from powerpc to kernel/irq This patch only moves the code. It doesn't make any changes, and the code is still only compiled for powerpc. Follow-on patches will generalize the code for other architectures. Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 600 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 600 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 509adb8762d7..f551bc1d3167 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,14 +1,612 @@ +#include +#include +#include #include +#include #include #include #include #include #include +#include #include +#include +#include static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); +#ifdef CONFIG_PPC +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_host; + +static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +{ + return h->of_node != NULL && h->of_node == np; +} + +/** + * irq_alloc_host() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the interrupt controller + * @revmap_type: type of reverse mapping to use + * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map + * @ops: map/unmap host callbacks + * @inval_irq: provide a hw number in that host space that is always invalid + * + * Allocates and initialize and irq_domain structure. Note that in the case of + * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by + * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be + * allocated later during boot automatically (the reverse mapping will use the + * slow path until that happens). 
+ */ +struct irq_domain *irq_alloc_host(struct device_node *of_node, + unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_domain_ops *ops, + irq_hw_number_t inval_irq) +{ + struct irq_domain *host, *h; + unsigned int size = sizeof(struct irq_domain); + unsigned int i; + unsigned int *rmap; + + /* Allocate structure and revmap table if using linear mapping */ + if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) + size += revmap_arg * sizeof(unsigned int); + host = kzalloc(size, GFP_KERNEL); + if (host == NULL) + return NULL; + + /* Fill structure */ + host->revmap_type = revmap_type; + host->inval_irq = inval_irq; + host->ops = ops; + host->of_node = of_node_get(of_node); + + if (host->ops->match == NULL) + host->ops->match = default_irq_host_match; + + mutex_lock(&irq_domain_mutex); + /* Make sure only one legacy controller can be created */ + if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(host->of_node); + kfree(host); + return NULL; + } + } + } + list_add(&host->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + + /* Additional setups per revmap type */ + switch(revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: + /* 0 is always the invalid number for legacy */ + host->inval_irq = 0; + /* setup us as the host for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = host; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(host, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); + } + break; + case IRQ_DOMAIN_MAP_LINEAR: + rmap = (unsigned int *)(host + 1); + for (i = 0; i < revmap_arg; i++) + rmap[i] = NO_IRQ; + host->revmap_data.linear.size = revmap_arg; + host->revmap_data.linear.revmap = rmap; + break; + case IRQ_DOMAIN_MAP_TREE: + INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + break; + default: + break; + } + + pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + + return host; +} + +/** + * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller + */ +struct irq_domain *irq_find_host(struct device_node *node) +{ + struct irq_domain *h, *found = NULL; + + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... + */ + mutex_lock(&irq_domain_mutex); + list_for_each_entry(h, &irq_domain_list, link) + if (h->ops->match(h, node)) { + found = h; + break; + } + mutex_unlock(&irq_domain_mutex); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @host: default host pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. 
+ */ +void irq_set_default_host(struct irq_domain *host) +{ + pr_debug("irq: Default host set to @0x%p\n", host); + + irq_default_host = host; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); + + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = host; + if (host->ops->map(host, virq, hwirq)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_data->domain = NULL; + irq_data->hwirq = 0; + return -1; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + + return 0; +} + +/** + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @host: domain to allocate the irq for or NULL for default host + * + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. + */ +unsigned int irq_create_direct_mapping(struct irq_domain *host) +{ + unsigned int virq; + + if (host == NULL) + host = irq_default_host; + + BUG_ON(host == NULL); + WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: create_direct virq allocation failed\n"); + return NO_IRQ; + } + if (virq >= irq_virq_count) { + pr_err("ERROR: no free irqs available below %i maximum\n", + irq_virq_count); + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: create_direct obtained virq %d\n", virq); + + if (irq_setup_virq(host, virq, virq)) { + irq_free_desc(virq); + return NO_IRQ; + } + + return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @host: host owning this hardware interrupt or NULL for default host + * @hwirq: hardware irq number in that host space + * + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. 
+ */ +unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL host, hwirq=%lx\n", hwirq); + WARN_ON(1); + return NO_IRQ; + } + pr_debug("irq: -> using host @%p\n", host); + + /* Check if mapping already exists */ + virq = irq_find_mapping(host, hwirq); + if (virq != NO_IRQ) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + return virq; + } + + /* Get a virtual interrupt number */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + /* Handle legacy */ + virq = (unsigned int)hwirq; + if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) + return NO_IRQ; + return virq; + } else { + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: -> virq allocation failed\n"); + return NO_IRQ; + } + } + + if (irq_setup_virq(host, virq, hwirq)) { + if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + irq_free_desc(virq); + return NO_IRQ; + } + + pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", + hwirq, host->of_node ? host->of_node->full_name : "null", virq); + + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *host; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + unsigned int virq; + + if (controller == NULL) + host = irq_default_host; + else + host = irq_find_host(controller); + if (host == NULL) { + printk(KERN_WARNING "irq: no irq host found for %s !\n", + controller->full_name); + return NO_IRQ; + } + + /* If host has no translation, then we assume interrupt line */ + if (host->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (host->ops->xlate(host, controller, intspec, intsize, + &hwirq, &type)) + return NO_IRQ; + } + + /* Create mapping */ + virq = irq_create_mapping(host, hwirq); + if (virq == NO_IRQ) + return virq; + + /* Set type if specified and different than the current one */ + if (type != IRQ_TYPE_NONE && + type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + irq_set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +/** + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_domain *host; + irq_hw_number_t hwirq; + + if (virq == NO_IRQ || !irq_data) + return; + + host = irq_data->domain; + if (WARN_ON(host == NULL)) + return; + + /* Never unmap legacy interrupts */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (host->ops->unmap) + host->ops->unmap(host, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_data->hwirq; + switch(host->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < host->revmap_data.linear.size) + host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + break; + case 
IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&host->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } + + /* Destroy map */ + irq_data->hwirq = host->inval_irq; + + irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @host: domain owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) + return NO_IRQ; + + /* legacy -> bail early */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return hwirq; + + /* Slow path does a linear search of the map */ + if (hint == 0) + hint = 1; + i = hint; + do { + struct irq_data *data = irq_get_irq_data(i); + if (data && (data->domain == host) && (data->hwirq == hwirq)) + return i; + i++; + if (i >= irq_virq_count) + i = 1; + } while(i != hint); + return NO_IRQ; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses radix tree + * revmaps + */ +unsigned int irq_radix_revmap_lookup(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(host, hwirq); + + /* + * Freeing an irq can delete nodes along the path to + * do the lookup via call_rcu. + */ + rcu_read_lock(); + irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + rcu_read_unlock(); + + /* + * If found in radix tree, then fine. + * Else fallback to linear lookup - this should not happen in practice + * as it means that we failed to insert the node in the radix tree. + */ + return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); +} + +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @host: host owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that host space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. + */ +void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return; + + if (virq != NO_IRQ) { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. 
It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(host, hwirq); + + /* Check revmap bounds */ + if (unlikely(hwirq >= host->revmap_data.linear.size)) + return irq_find_mapping(host, hwirq); + + /* Check if revmap was allocated */ + revmap = host->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(host, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(revmap[hwirq] == NO_IRQ)) + revmap[hwirq] = irq_find_mapping(host, hwirq); + + return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ + unsigned long flags; + struct irq_desc *desc; + const char *p; + static const char none[] = "none"; + void *data; + int i; + + seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", + "chip name", "chip data", "host name"); + + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + if (desc->action && desc->action->handler) { + struct irq_chip *chip; + + seq_printf(m, "%5d ", i); + seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + + chip = irq_desc_get_chip(desc); + if (chip && chip->name) + p = chip->name; + else + p = none; + seq_printf(m, "%-15s ", p); + + data = irq_desc_get_chip_data(desc); + seq_printf(m, "0x%16p ", data); + + if (desc->irq_data.domain->of_node) + p = desc->irq_data.domain->of_node->full_name; + else + p = none; + seq_printf(m, "%s\n", p); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} + +static int virq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { + .open = virq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ + if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + NULL, &virq_debug_fops) == NULL) + return -ENOMEM; + + return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +#else /* CONFIG_PPC */ + /** * irq_domain_add() - Register an irq_domain * @domain: ptr to initialized irq_domain structure @@ -185,3 +783,5 @@ struct irq_domain_ops irq_domain_simple_ops = { #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#endif /* !CONFIG_PPC */ -- cgit From 03848373ea741caafab952fb62405ed7fc0c279c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:52 -0700 Subject: irq_domain: remove NO_IRQ from irq domain code zero always means no irq when using irq domains. Get rid of the NO_IRQ references. 
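A minimal caller-side sketch of the convention (illustrative, not taken from the patch; example_get_virq is an invented name):

static int example_get_virq(struct irq_domain *domain, irq_hw_number_t hwirq)
{
	unsigned int virq = irq_create_mapping(domain, hwirq);

	/* With irq domains, 0 is the universal "no irq" value, so the
	 * portable failure check is !virq rather than a comparison
	 * against an arch-specific NO_IRQ constant. */
	if (!virq)
		return -ENODEV;
	return virq;
}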
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f551bc1d3167..8f7b91ce53c4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -108,7 +108,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, case IRQ_DOMAIN_MAP_LINEAR: rmap = (unsigned int *)(host + 1); for (i = 0; i < revmap_arg; i++) - rmap[i] = NO_IRQ; + rmap[i] = 0; host->revmap_data.linear.size = revmap_arg; host->revmap_data.linear.revmap = rmap; break; @@ -218,9 +218,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: create_direct virq allocation failed\n"); - return NO_IRQ; + return 0; } if (virq >= irq_virq_count) { pr_err("ERROR: no free irqs available below %i maximum\n", @@ -233,7 +233,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) if (irq_setup_virq(host, virq, virq)) { irq_free_desc(virq); - return NO_IRQ; + return 0; } return virq; @@ -263,13 +263,13 @@ unsigned int irq_create_mapping(struct irq_domain *host, printk(KERN_WARNING "irq_create_mapping called for" " NULL host, hwirq=%lx\n", hwirq); WARN_ON(1); - return NO_IRQ; + return 0; } pr_debug("irq: -> using host @%p\n", host); /* Check if mapping already exists */ virq = irq_find_mapping(host, hwirq); - if (virq != NO_IRQ) { + if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } @@ -279,7 +279,7 @@ unsigned int irq_create_mapping(struct irq_domain *host, /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return NO_IRQ; + return 0; return virq; } else { /* Allocate a virtual interrupt number */ @@ -289,16 +289,16 @@ unsigned int irq_create_mapping(struct irq_domain *host, virq = irq_alloc_desc_from(hint, 0); if (!virq) virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: -> virq allocation failed\n"); - return NO_IRQ; + return 0; } } if (irq_setup_virq(host, virq, hwirq)) { if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); - return NO_IRQ; + return 0; } pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", @@ -323,7 +323,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (host == NULL) { printk(KERN_WARNING "irq: no irq host found for %s !\n", controller->full_name); - return NO_IRQ; + return 0; } /* If host has no translation, then we assume interrupt line */ @@ -332,12 +332,12 @@ unsigned int irq_create_of_mapping(struct device_node *controller, else { if (host->ops->xlate(host, controller, intspec, intsize, &hwirq, &type)) - return NO_IRQ; + return 0; } /* Create mapping */ virq = irq_create_mapping(host, hwirq); - if (virq == NO_IRQ) + if (!virq) return virq; /* Set type if specified and different than the current one */ @@ -358,7 +358,7 @@ void irq_dispose_mapping(unsigned int virq) struct irq_domain *host; irq_hw_number_t hwirq; - if (virq == NO_IRQ || !irq_data) + if (!virq || !irq_data) return; host = irq_data->domain; @@ -387,7 +387,7 @@ void irq_dispose_mapping(unsigned int virq) switch(host->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: if (hwirq < host->revmap_data.linear.size) - 
host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + host->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); @@ -422,7 +422,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (host == NULL) host = irq_default_host; if (host == NULL) - return NO_IRQ; + return 0; /* legacy -> bail early */ if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) @@ -440,7 +440,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (i >= irq_virq_count) i = 1; } while(i != hint); - return NO_IRQ; + return 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -493,7 +493,7 @@ void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; - if (virq != NO_IRQ) { + if (virq) { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); @@ -527,7 +527,7 @@ unsigned int irq_linear_revmap(struct irq_domain *host, return irq_find_mapping(host, hwirq); /* Fill up revmap with slow path if no mapping found */ - if (unlikely(revmap[hwirq] == NO_IRQ)) + if (unlikely(!revmap[hwirq])) revmap[hwirq] = irq_find_mapping(host, hwirq); return revmap[hwirq]; -- cgit From 68700650e71b6bb6636673f4f9c8ec807353d8d6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:53 -0700 Subject: irq_domain: Remove references to old irq_host names No functional changes. Replaces non-exported references to 'host' with domain. Does not change any symbol names referenced by other .c files. Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 219 ++++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8f7b91ce53c4..432d292b33f8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -19,11 +19,11 @@ static DEFINE_MUTEX(irq_domain_mutex); #ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; -static struct irq_domain *irq_default_host; +static struct irq_domain *irq_default_domain; -static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) { - return h->of_node != NULL && h->of_node == np; + return d->of_node != NULL && d->of_node == np; } /** @@ -31,8 +31,8 @@ static int default_irq_host_match(struct irq_domain *h, struct device_node *np) * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map - * @ops: map/unmap host callbacks - * @inval_irq: provide a hw number in that host space that is always invalid + * @ops: map/unmap domain callbacks + * @inval_irq: provide a hw number in that domain space that is always invalid * * Allocates and initialize and irq_domain structure. 
Note that in the case of * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns @@ -48,7 +48,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, struct irq_domain_ops *ops, irq_hw_number_t inval_irq) { - struct irq_domain *host, *h; + struct irq_domain *domain, *h; unsigned int size = sizeof(struct irq_domain); unsigned int i; unsigned int *rmap; @@ -56,18 +56,18 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, /* Allocate structure and revmap table if using linear mapping */ if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) size += revmap_arg * sizeof(unsigned int); - host = kzalloc(size, GFP_KERNEL); - if (host == NULL) + domain = kzalloc(size, GFP_KERNEL); + if (domain == NULL) return NULL; /* Fill structure */ - host->revmap_type = revmap_type; - host->inval_irq = inval_irq; - host->ops = ops; - host->of_node = of_node_get(of_node); + domain->revmap_type = revmap_type; + domain->inval_irq = inval_irq; + domain->ops = ops; + domain->of_node = of_node_get(of_node); - if (host->ops->match == NULL) - host->ops->match = default_irq_host_match; + if (domain->ops->match == NULL) + domain->ops->match = default_irq_domain_match; mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ @@ -75,53 +75,53 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, list_for_each_entry(h, &irq_domain_list, link) { if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { mutex_unlock(&irq_domain_mutex); - of_node_put(host->of_node); - kfree(host); + of_node_put(domain->of_node); + kfree(domain); return NULL; } } } - list_add(&host->link, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); /* Additional setups per revmap type */ switch(revmap_type) { case IRQ_DOMAIN_MAP_LEGACY: /* 0 is always the invalid number for legacy */ - host->inval_irq = 0; - /* setup us as the host for all legacy interrupts */ + domain->inval_irq = 0; + /* setup us as the domain for all legacy interrupts */ for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { struct irq_data *irq_data = irq_get_irq_data(i); irq_data->hwirq = i; - irq_data->domain = host; + irq_data->domain = domain; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(host, i, i); + ops->map(domain, i, i); /* Clear norequest flags */ irq_clear_status_flags(i, IRQ_NOREQUEST); } break; case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(host + 1); + rmap = (unsigned int *)(domain + 1); for (i = 0; i < revmap_arg; i++) rmap[i] = 0; - host->revmap_data.linear.size = revmap_arg; - host->revmap_data.linear.revmap = rmap; + domain->revmap_data.linear.size = revmap_arg; + domain->revmap_data.linear.revmap = rmap; break; case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); break; default: break; } - pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); - return host; + return domain; } /** @@ -150,18 +150,18 @@ EXPORT_SYMBOL_GPL(irq_find_host); /** * irq_set_default_host() - Set a "default" irq domain - * @host: default host pointer + * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). 
It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *host) +void irq_set_default_host(struct irq_domain *domain) { - pr_debug("irq: Default host set to @0x%p\n", host); + pr_debug("irq: Default domain set to @0x%p\n", domain); - irq_default_host = host; + irq_default_domain = domain; } /** @@ -180,14 +180,14 @@ void irq_set_virq_count(unsigned int count) irq_virq_count = count; } -static int irq_setup_virq(struct irq_domain *host, unsigned int virq, +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); irq_data->hwirq = hwirq; - irq_data->domain = host; - if (host->ops->map(host, virq, hwirq)) { + irq_data->domain = domain; + if (domain->ops->map(domain, virq, hwirq)) { pr_debug("irq: -> mapping failed, freeing\n"); irq_data->domain = NULL; irq_data->hwirq = 0; @@ -201,21 +201,21 @@ static int irq_setup_virq(struct irq_domain *host, unsigned int virq, /** * irq_create_direct_mapping() - Allocate an irq for direct mapping - * @host: domain to allocate the irq for or NULL for default host + * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use * the linux irq as the hardware interrupt number. */ -unsigned int irq_create_direct_mapping(struct irq_domain *host) +unsigned int irq_create_direct_mapping(struct irq_domain *domain) { unsigned int virq; - if (host == NULL) - host = irq_default_host; + if (domain == NULL) + domain = irq_default_domain; - BUG_ON(host == NULL); - WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + BUG_ON(domain == NULL); + WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); if (!virq) { @@ -231,7 +231,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) pr_debug("irq: create_direct obtained virq %d\n", virq); - if (irq_setup_virq(host, virq, virq)) { + if (irq_setup_virq(domain, virq, virq)) { irq_free_desc(virq); return 0; } @@ -241,41 +241,41 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) /** * irq_create_mapping() - Map a hardware interrupt into linux irq space - * @host: host owning this hardware interrupt or NULL for default host - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. 
*/ -unsigned int irq_create_mapping(struct irq_domain *host, +unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int virq, hint; - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) { + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) { printk(KERN_WARNING "irq_create_mapping called for" - " NULL host, hwirq=%lx\n", hwirq); + " NULL domain, hwirq=%lx\n", hwirq); WARN_ON(1); return 0; } - pr_debug("irq: -> using host @%p\n", host); + pr_debug("irq: -> using domain @%p\n", domain); /* Check if mapping already exists */ - virq = irq_find_mapping(host, hwirq); + virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } /* Get a virtual interrupt number */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) @@ -295,14 +295,14 @@ unsigned int irq_create_mapping(struct irq_domain *host, } } - if (irq_setup_virq(host, virq, hwirq)) { - if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + if (irq_setup_virq(domain, virq, hwirq)) { + if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); return 0; } - pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", - hwirq, host->of_node ? host->of_node->full_name : "null", virq); + pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); return virq; } @@ -311,32 +311,29 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - if (controller == NULL) - host = irq_default_host; - else - host = irq_find_host(controller); - if (host == NULL) { - printk(KERN_WARNING "irq: no irq host found for %s !\n", + domain = controller ? 
irq_find_host(controller) : irq_default_domain; + if (!domain) { + printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; } - /* If host has no translation, then we assume interrupt line */ - if (host->ops->xlate == NULL) + /* If domain has no translation, then we assume interrupt line */ + if (domain->ops->xlate == NULL) hwirq = intspec[0]; else { - if (host->ops->xlate(host, controller, intspec, intsize, + if (domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type)) return 0; } /* Create mapping */ - virq = irq_create_mapping(host, hwirq); + virq = irq_create_mapping(domain, hwirq); if (!virq) return virq; @@ -355,18 +352,18 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping); void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; if (!virq || !irq_data) return; - host = irq_data->domain; - if (WARN_ON(host == NULL)) + domain = irq_data->domain; + if (WARN_ON(domain == NULL)) return; /* Never unmap legacy interrupts */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return; irq_set_status_flags(virq, IRQ_NOREQUEST); @@ -378,26 +375,26 @@ void irq_dispose_mapping(unsigned int virq) synchronize_irq(virq); /* Tell the PIC about it */ - if (host->ops->unmap) - host->ops->unmap(host, virq); + if (domain->ops->unmap) + domain->ops->unmap(domain, virq); smp_mb(); /* Clear reverse map */ hwirq = irq_data->hwirq; - switch(host->revmap_type) { + switch(domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = 0; + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&host->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); break; } /* Destroy map */ - irq_data->hwirq = host->inval_irq; + irq_data->hwirq = domain->inval_irq; irq_free_desc(virq); } @@ -405,27 +402,27 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * irq_find_mapping() - Find a linux irq from an hw irq number. - * @host: domain owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a slow path, for use by generic code. It's expected that an * irq controller implementation directly calls the appropriate low level * mapping function. 
*/ -unsigned int irq_find_mapping(struct irq_domain *host, +unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; unsigned int hint = hwirq % irq_virq_count; - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) return 0; /* legacy -> bail early */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return hwirq; /* Slow path does a linear search of the map */ @@ -434,7 +431,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, i = hint; do { struct irq_data *data = irq_get_irq_data(i); - if (data && (data->domain == host) && (data->hwirq == hwirq)) + if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; if (i >= irq_virq_count) @@ -446,26 +443,26 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); /** * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses radix tree * revmaps */ -unsigned int irq_radix_revmap_lookup(struct irq_domain *host, +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *irq_data; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(domain, hwirq); /* * Freeing an irq can delete nodes along the path to * do the lookup via call_rcu. */ rcu_read_lock(); - irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); rcu_read_unlock(); /* @@ -473,62 +470,62 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *host, * Else fallback to linear lookup - this should not happen in practice * as it means that we failed to insert the node in the radix tree. */ - return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); + return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } /** * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. - * @host: host owning this hardware interrupt + * @domain: domain owning this hardware interrupt * @virq: linux irq number - * @hwirq: hardware irq number in that host space + * @hwirq: hardware irq number in that domain space * * This is for use by irq controllers that use a radix tree reverse * mapping for fast lookup. */ -void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); - if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; if (virq) { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } } /** * irq_linear_revmap() - Find a linux irq from a hw irq number. 
- * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses linear * revmaps. It does fallback to the slow path if the revmap doesn't exist * yet and will create the revmap entry with appropriate locking */ -unsigned int irq_linear_revmap(struct irq_domain *host, +unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int *revmap; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(domain, hwirq); /* Check revmap bounds */ - if (unlikely(hwirq >= host->revmap_data.linear.size)) - return irq_find_mapping(host, hwirq); + if (unlikely(hwirq >= domain->revmap_data.linear.size)) + return irq_find_mapping(domain, hwirq); /* Check if revmap was allocated */ - revmap = host->revmap_data.linear.revmap; + revmap = domain->revmap_data.linear.revmap; if (unlikely(revmap == NULL)) - return irq_find_mapping(host, hwirq); + return irq_find_mapping(domain, hwirq); /* Fill up revmap with slow path if no mapping found */ if (unlikely(!revmap[hwirq])) - revmap[hwirq] = irq_find_mapping(host, hwirq); + revmap[hwirq] = irq_find_mapping(domain, hwirq); return revmap[hwirq]; } @@ -544,7 +541,7 @@ static int virq_debug_show(struct seq_file *m, void *private) int i; seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "host name"); + "chip name", "chip data", "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); -- cgit From a8db8cf0d894df5f1dcfd4bce9894e0dbcc01c96 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:54 -0700 Subject: irq_domain: Replace irq_alloc_host() with revmap-specific initializers Each revmap type has different arguments for setting up the revmap. This patch splits up the generator functions so that each revmap type can do its own setup and the user doesn't need to keep track of how each revmap type handles the arguments. This patch also adds a host_data argument to the generators. There are cases where the host_data pointer will be needed before the function returns. ie. the legacy map calls the .map callback for each irq before returning. 
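As an illustration, a controller driver built on the new interface might look like the sketch below; the foo_* names, the struct foo_intc type, the node pointer and the domain size of 64 are hypothetical, while the irq_domain_add_linear() signature follows the patch itself:

	static int foo_irq_map(struct irq_domain *d, unsigned int virq,
			       irq_hw_number_t hwirq)
	{
		/* d->host_data is the pointer passed at domain creation */
		struct foo_intc *intc = d->host_data;

		irq_set_chip_data(virq, intc);
		irq_set_chip_and_handler(virq, &foo_irq_chip, handle_level_irq);
		return 0;
	}

	static struct irq_domain_ops foo_irq_ops = {
		.map = foo_irq_map,
	};

	/* hypothetical probe path: one linear domain with 64 hwirqs */
	intc->domain = irq_domain_add_linear(node, 64, &foo_irq_ops, intc);
	if (!intc->domain)
		return -ENOMEM;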
v2: - Add void *host_data argument to irq_domain_add_*() functions - fixed failure to compile - Moved IRQ_DOMAIN_MAP_* defines into irqdomain.c Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 200 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 432d292b33f8..acedba1a2651 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,6 +13,11 @@ #include #include +#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -27,100 +32,158 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np } /** - * irq_alloc_host() - Allocate a new irq_domain data structure + * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use - * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map * @ops: map/unmap domain callbacks - * @inval_irq: provide a hw number in that domain space that is always invalid + * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Note that in the case of - * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by - * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be - * allocated later during boot automatically (the reverse mapping will use the - * slow path until that happens). + * Allocates and initialize and irq_domain structure. Caller is expected to + * register allocated irq_domain with irq_domain_register(). Returns pointer + * to IRQ domain, or NULL on failure. 
*/ -struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, + unsigned int revmap_type, + struct irq_domain_ops *ops, + void *host_data) { - struct irq_domain *domain, *h; - unsigned int size = sizeof(struct irq_domain); - unsigned int i; - unsigned int *rmap; + struct irq_domain *domain; - /* Allocate structure and revmap table if using linear mapping */ - if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) - size += revmap_arg * sizeof(unsigned int); - domain = kzalloc(size, GFP_KERNEL); - if (domain == NULL) + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (WARN_ON(!domain)) return NULL; /* Fill structure */ domain->revmap_type = revmap_type; - domain->inval_irq = inval_irq; domain->ops = ops; + domain->host_data = host_data; domain->of_node = of_node_get(of_node); if (domain->ops->match == NULL) domain->ops->match = default_irq_domain_match; + return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + list_add(&domain->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + pr_debug("irq: Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). + */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain, *h; + unsigned int i; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + if (!domain) + return NULL; + mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ - if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { - mutex_unlock(&irq_domain_mutex); - of_node_put(domain->of_node); - kfree(domain); - return NULL; - } + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(domain->of_node); + kfree(domain); + return NULL; } } list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - /* Additional setups per revmap type */ - switch(revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* 0 is always the invalid number for legacy */ - domain->inval_irq = 0; - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; - irq_data->domain = domain; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - ops->map(domain, i, i); - - /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); - } - break; - case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(domain + 1); - for (i = 0; i < revmap_arg; i++) - rmap[i] = 0; - domain->revmap_data.linear.size = revmap_arg; - domain->revmap_data.linear.revmap = rmap; - break; - case IRQ_DOMAIN_MAP_TREE: - 
INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - break; - default: - break; + /* setup us as the domain for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = domain; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(domain, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); } + return domain; +} - pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); +/** + * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int *revmap; + + revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + if (WARN_ON(!revmap)) + return NULL; + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); + if (!domain) { + kfree(revmap); + return NULL; + } + domain->revmap_data.linear.size = size; + domain->revmap_data.linear.revmap = revmap; + irq_domain_add(domain); + return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + if (domain) + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_tree() + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens). + */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_TREE, ops, host_data); + if (domain) { + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); + irq_domain_add(domain); + } return domain; } @@ -393,9 +456,6 @@ void irq_dispose_mapping(unsigned int virq) break; } - /* Destroy map */ - irq_data->hwirq = domain->inval_irq; - irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit From 1bc04f2cf8c2a1feadbd994f50c40bb145bf2989 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:55 -0700 Subject: irq_domain: Add support for base irq and hwirq in legacy mappings Add support for a legacy mapping where irq = (hwirq - first_hwirq + first_irq) so that a controller driver can allocate a fixed range of irq_descs and use a simple calculation to translate back and forth between linux and hw irq numbers. This is needed to use an irq_domain with many of the ARM interrupt controller drivers that manage their own irq_desc allocations. Ultimately the goal is to migrate those drivers to use the linear revmap, but doing it this way allows each driver to be converted separately which makes the migration path easier. This patch generalizes the IRQ_DOMAIN_MAP_LEGACY method to use (first_irq-first_hwirq) as the offset between hwirq and linux irq number, and adds checks to make sure that the hwirq number does not exceed range assigned to the controller. 
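As an illustration of the resulting translation (controller name and numbers hypothetical, the call signature taken from the patch below):

	/* 16 hwirqs starting at hwirq 0, mapped onto Linux irqs 32..47 */
	domain = irq_domain_add_legacy(node, 16, 32, 0, &foo_irq_ops, NULL);

	/*
	 * The mapping is irq = hwirq - first_hwirq + first_irq, so hwirq 5
	 * maps to irq 37; hwirqs outside [0, 16) fail the new range check
	 * and translate to irq 0.
	 */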
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 96 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index acedba1a2651..c6740d72073e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,7 +13,8 @@ #include #include -#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ @@ -74,9 +75,25 @@ static void irq_domain_add(struct irq_domain *domain) domain->revmap_type, domain); } +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; + int size = domain->revmap_data.legacy.size; + + if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) + return 0; + return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + * be '0', but a positive integer can be used if the effective + * hwirqs numbering does not begin at zero. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -85,44 +102,64 @@ static void irq_domain_add(struct irq_domain *domain) * a legacy controller). 
*/ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain, *h; + struct irq_domain *domain; unsigned int i; domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); if (!domain) return NULL; + domain->revmap_data.legacy.first_irq = first_irq; + domain->revmap_data.legacy.first_hwirq = first_hwirq; + domain->revmap_data.legacy.size = size; + mutex_lock(&irq_domain_mutex); - /* Make sure only one legacy controller can be created */ - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + /* Verify that all the irqs are available */ + for (i = 0; i < size; i++) { + int irq = first_irq + i; + struct irq_data *irq_data = irq_get_irq_data(irq); + + if (WARN_ON(!irq_data || irq_data->domain)) { mutex_unlock(&irq_domain_mutex); of_node_put(domain->of_node); kfree(domain); return NULL; } } - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; + /* Claim all of the irqs before registering a legacy domain */ + for (i = 0; i < size; i++) { + struct irq_data *irq_data = irq_get_irq_data(first_irq + i); + irq_data->hwirq = first_hwirq + i; irq_data->domain = domain; + } + mutex_unlock(&irq_domain_mutex); + + for (i = 0; i < size; i++) { + int irq = first_irq + i; + int hwirq = first_hwirq + i; + + /* IRQ0 gets ignored */ + if (!irq) + continue; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(domain, i, i); + ops->map(domain, irq, hwirq); /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); + irq_clear_status_flags(irq, IRQ_NOREQUEST); } + + irq_domain_add(domain); return domain; } @@ -338,24 +375,19 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - /* Handle legacy */ - virq = (unsigned int)hwirq; - if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return 0; - return virq; - } else { - /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, 0); - if (!virq) - virq = irq_alloc_desc_from(1, 0); - if (!virq) { - pr_debug("irq: -> virq allocation failed\n"); - return 0; - } + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: -> virq allocation failed\n"); + return 0; } if (irq_setup_virq(domain, virq, hwirq)) { @@ -483,7 +515,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, /* legacy -> bail early */ if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return hwirq; + return irq_domain_legacy_revmap(domain, hwirq); /* Slow path does a linear search of the map */ if (hint == 0) -- cgit From 75294957be1dee7d22dd7d90bd31334ba410e836 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:57 -0700 Subject: irq_domain: Remove 'new' irq_domain in favour of the ppc one This patch removes the 
simplistic implementation of irq_domains and enables the powerpc infrastructure for all irq_domain users. The powerpc infrastructure includes support for complex mappings between Linux and hardware irq numbers, and can manage allocation of irq_descs. This patch also converts the few users of irq_domain_add()/irq_domain_del() to call irq_domain_add_legacy() instead. v3: Fix bug that set up too many irqs in translation range. v2: Fix removal of irq_alloc_descs() call in gic driver Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 159 ++++--------------------------------------------- 1 file changed, 13 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c6740d72073e..2981ebfeb40c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -22,7 +22,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -#ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; @@ -694,124 +693,11 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_VIRQ_DEBUG */ -#else /* CONFIG_PPC */ - -/** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure - * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. - */ -void irq_domain_add(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; - } - d->domain = domain; - d->hwirq = hwirq; - } - - mutex_lock(&irq_domain_mutex); - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); -} - -/** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. - */ -void irq_domain_del(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - mutex_lock(&irq_domain_mutex); - list_del(&domain->link); - mutex_unlock(&irq_domain_mutex); - - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; - } -} - -#if defined(CONFIG_OF_IRQ) -/** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec - * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. - * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). 
- */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; - - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - if (!domain->ops->xlate) - continue; - rc = domain->ops->xlate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; - } - mutex_unlock(&irq_domain_mutex); - - if (rc != 0) - return 0; - - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -/** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded - * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). - */ -void irq_dispose_mapping(unsigned int irq) +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) { - /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain - */ + return 0; } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, @@ -822,10 +708,6 @@ int irq_domain_simple_xlate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; - *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; if (intsize > 1) @@ -833,23 +715,17 @@ int irq_domain_simple_xlate(struct irq_domain *d, return 0; } -/** - * irq_domain_create_simple() - Set up a 'simple' translation range - */ +struct irq_domain_ops irq_domain_simple_ops = { + .map = irq_domain_simple_map, + .xlate = irq_domain_simple_xlate, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ void irq_domain_add_simple(struct device_node *controller, int irq_base) { - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } - - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); + irq_domain_add_legacy(controller, 32, irq_base, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -864,13 +740,4 @@ void irq_domain_generate_simple(const struct of_device_id *match, irq_domain_add_simple(node, irq_start); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .xlate = irq_domain_simple_xlate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#endif /* !CONFIG_PPC */ +#endif -- cgit From 6b783f7c5dde2648fa0bbe7fc8ac80d78699e67f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 10 Jan 2012 17:09:30 -0700 Subject: irq_domain: Remove irq_domain_add_simple() irq_domain_add_simple() was a stop-gap measure until complete irq_domain support was complete. This patch removes the irq_domain_add_simple() interface. 
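A former irq_domain_add_simple(node, irq_base) call therefore becomes a direct legacy registration, exactly as the diff below does for irq_domain_generate_simple():

	irq_domain_add_legacy(node, 32, irq_base, 0,
			      &irq_domain_simple_ops, NULL);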
This patch also drops the explicit irq_domain initialization performed by the mach-versatile code because the versatile interrupt controller already has irq_domain support built into it. This was a bug that was hanging around quietly for a while, but with the full irq_domain which actually verifies that irq_domain ranges are available it would cause the registration to fail and the system wouldn't boot. v4: Fixed number of irqs in mx5 gpio code v2: Updated to pass in host_data pointer on irq_domain allocation. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Cc: Milton Miller Cc: Russell King Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2981ebfeb40c..6328d9350f04 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -722,13 +722,6 @@ struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); #ifdef CONFIG_OF_IRQ -void irq_domain_add_simple(struct device_node *controller, int irq_base) -{ - irq_domain_add_legacy(controller, 32, irq_base, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_add_simple); - void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { @@ -737,7 +730,8 @@ void irq_domain_generate_simple(const struct of_device_id *match, (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) - irq_domain_add_simple(node, irq_start); + irq_domain_add_legacy(node, 32, irq_start, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif -- cgit From 16b2e6e2f31dda41f114aa0acade04f7e10f67c9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 11:26:52 -0700 Subject: irq_domain: Create common xlate functions that device drivers can use Rather than having each interrupt controller driver creating its own barely unique .xlate function for irq_domain, create a library of translators which any driver can use directly. v5: - Remove irq_domain_xlate_pci(). It was incorrect. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Mark Salter Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 65 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6328d9350f04..456e3fc8387f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -699,25 +699,70 @@ int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, return 0; } -int irq_domain_simple_xlate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. 
+ */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) { - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) + if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; return 0; } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); + +/** + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 2)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); + +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings. For the majority of controllers + * the _onecell() or _twocell() variants above should be used. + */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, + struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, - .xlate = irq_domain_simple_xlate, + .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit From a18dc81bf58258ac0920bec26b91656cb0140d2a Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 12:12:14 -0700 Subject: irq_domain: constify irq_domain_ops Make irq_domain_ops pointer a constant to make it safer for multiple instances to share the same ops pointer and change the irq_domain code so that it does not modify the ops. 
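With the core no longer writing to the ops, a driver can now put its table in read-only memory and share it freely between instances; a minimal sketch (the foo_* names are hypothetical):

	static const struct irq_domain_ops foo_irq_ops = {
		.map	= foo_irq_map,
		.xlate	= irq_domain_xlate_onecell,
	};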
v4: Fix mismatched type reference in powerpc code Signed-off-by: Grant Likely Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 456e3fc8387f..25a498eb98a3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -26,11 +26,6 @@ static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; -static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) -{ - return d->of_node != NULL && d->of_node == np; -} - /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -44,7 +39,7 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, unsigned int revmap_type, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -59,9 +54,6 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); - if (domain->ops->match == NULL) - domain->ops->match = default_irq_domain_match; - return domain; } @@ -104,7 +96,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -170,7 +162,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, */ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -192,7 +184,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -211,7 +203,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, * (the reverse mapping will use the slow path until that happens). */ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -230,6 +222,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, struct irq_domain *irq_find_host(struct device_node *node) { struct irq_domain *h, *found = NULL; + int rc; /* We might want to match the legacy controller last since * it might potentially be set to match all interrupts in @@ -237,11 +230,17 @@ struct irq_domain *irq_find_host(struct device_node *node) * yet though... 
*/ mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, link) - if (h->ops->match(h, node)) { + list_for_each_entry(h, &irq_domain_list, link) { + if (h->ops->match) + rc = h->ops->match(h, node); + else + rc = (h->of_node != NULL) && (h->of_node == node); + + if (rc) { found = h; break; } + } mutex_unlock(&irq_domain_mutex); return found; } @@ -760,7 +759,7 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); -struct irq_domain_ops irq_domain_simple_ops = { +const struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, .xlate = irq_domain_xlate_onetwocell, }; -- cgit From 55ae451918ec62e553f11b6118fec157f90c31c3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:14 +0100 Subject: PM / Sleep: Unify kerneldoc comments in kernel/power/suspend.c The kerneldoc comments in kernel/power/suspend.c are not formatted in the same way and the quality of some of them is questionable. Unify the formatting and improve the contents. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/suspend.c | 56 ++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 03bc92b42750..e6b5ef958603 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { static const struct platform_suspend_ops *suspend_ops; /** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) } /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. */ int suspend_valid_only_mem(suspend_state_t state) { @@ -83,10 +83,11 @@ static int suspend_test(int level) } /** - * suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state. * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. */ static int suspend_prepare(void) { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. * * This function should be called after devices have been suspended. 
*/ @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } /** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. */ int suspend_devices_and_enter(suspend_state_t state) { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) } /** - * suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence. * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. */ static void suspend_finish(void) { @@ -265,14 +265,12 @@ static void suspend_finish(void) } /** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. */ int enter_state(suspend_state_t state) { @@ -310,11 +308,11 @@ int enter_state(suspend_state_t state) } /** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. */ int pm_suspend(suspend_state_t state) { -- cgit From 93e1ee43a72b11e1b50aab87046c131a836a4456 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:24 +0100 Subject: PM / Sleep: Make enter_state() in kernel/power/suspend.c static The enter_state() function in kernel/power/suspend.c should be static and state_store() in kernel/power/suspend.c should call pm_suspend() instead of it, so make that happen (which also reduces code duplication related to suspend statistics). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. 
Bhat --- kernel/power/main.c | 8 +++----- kernel/power/power.h | 2 -- kernel/power/suspend.c | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b1e324878d5f..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -291,12 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + error = pm_suspend(state); break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - suspend_stats_update(error); + } } #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 398d42b48e9e..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; } static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e6b5ef958603..4914358a0543 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -272,7 +272,7 @@ static void suspend_finish(void) * Fail if that's not the case. Otherwise, prepare for system suspend, make the * system enter the given sleep state and clean up after wakeup. */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state) { int error; -- cgit From bc25cf508942c56810d4fb623ef27b56ccef7783 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:33 +0100 Subject: PM / Sleep: Drop suspend_stats_update() Since suspend_stats_update() is only called from pm_suspend(), move its code directly into that function and remove the static inline definition from include/linux/suspend.h. Clean up pm_suspend() in the process. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/suspend.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4914358a0543..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -316,12 +316,18 @@ static int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - suspend_stats_update(ret); - return ret; + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; } - return -EINVAL; + return error; } EXPORT_SYMBOL(pm_suspend); -- cgit From 69f1d475cc80c55121852b3030873cdd407fd31c Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 14 Feb 2012 22:20:52 +0100 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel.
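For a hypothetical nosave region covering pfns 0x100-0x1ff with 4 KiB pages, the message changes from

	PM: Marking nosave pages: 0000000000100000 - 0000000000200000

to

	PM: Marking nosave pages: [mem 0x00100000-0x001fffff]

i.e. the half-open pfn range becomes an inclusive byte range in %pR style.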
Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..8e2e7461375f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { -- cgit From 9a4b430451bb6d8d6b7dcdfbee0e1330b7c475a6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:26 +0100 Subject: cgroup: Remove wrong comment on cgroup_enable_task_cg_list() Remove the stale comment about RCU protection. Many callers (all of them?) of cgroup_enable_task_cg_list() don't seem to be in an RCU read side critical section. Besides, RCU is not helpful to protect against while_each_thread(). Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 865d89a580c7..6e4eb4312571 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2701,9 +2701,6 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { -- cgit From 3ce3230a0cff484e5130153f244d4fb8a56b3a8b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:27 +0100 Subject: cgroup: Walk task list under tasklist_lock in cgroup_enable_task_cg_list Walking through the tasklist in cgroup_enable_task_cg_list() inside an RCU read side critical section is not enough because: - RCU is not (yet) safe against while_each_thread() - If we use only RCU, a forking task that has passed cgroup_post_fork() without seeing use_task_css_set_links == 1 is not guaranteed to have its child immediately visible in the tasklist if we walk through it remotely with RCU. In this case it will be missing in its css_set's task list. Thus we need to traverse the list (unfortunately) under the tasklist_lock. It makes us safe against while_each_thread() and also make sure we see all forked task that have been added to the tasklist. As a secondary effect, reading and writing use_task_css_set_links are now well ordered against tasklist traversing and modification. 
The new layout is:

    CPU 0                                    CPU 1

    use_task_css_set_links = 1               write_lock(tasklist_lock)
    read_lock(tasklist_lock)                 add task to tasklist
    do_each_thread() {                       write_unlock(tasklist_lock)
        add thread to css set links          if (use_task_css_set_links)
    } while_each_thread()                        add thread to css set links
    read_unlock(tasklist_lock)

If CPU 0 traverses the list after the task has been added to the tasklist, then it is correctly added to the css set links. OTOH if CPU 0 traverses the tasklist before the new task had the opportunity to be added to the tasklist because it was too early in the fork process, then CPU 1 catches up and adds the task to the css set links after it added the task to the tasklist. The right value of use_task_css_set_links is guaranteed to be visible from CPU 1 due to the LOCK/UNLOCK implicit barrier properties: the read_unlock() on CPU 0 ensures that the write to use_task_css_set_links has been committed, and the write_lock() on CPU 1 ensures that the read of use_task_css_set_links that comes afterward returns the correct value. Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6e4eb4312571..c6877fe9a831 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2707,6 +2707,14 @@ static void cgroup_enable_task_cg_lists(void) struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2718,6 +2726,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -4522,6 +4531,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { -- cgit From abd2363f6a5f1030b935e0bdc15cf917313b3b10 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 24 Feb 2012 08:07:06 -0700 Subject: irq_domain/mips: Allow irq_domain on MIPS This patch makes IRQ_DOMAIN usable on MIPS. It uses an ugly workaround to preserve current behaviour so that MIPS has time to add irq_domain registration to the irq controller drivers.
The workaround will be removed in Linux v3.6 Signed-off-by: Grant Likely Cc: Ralf Baechle Cc: Rob Herring Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org --- kernel/irq/irqdomain.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 25a498eb98a3..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -411,6 +411,18 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { +#ifdef CONFIG_MIPS + /* + * Workaround to avoid breaking interrupt controller drivers + * that don't yet register an irq_domain. This is temporary + * code. ~~~gcl, Feb 24, 2012 + * + * Scheduled for removal in Linux v3.6. That should be enough + * time. + */ + if (intsize > 0) + return intspec[0]; +#endif printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; -- cgit From 13ae246db4a02971ef4f557af1f6d3e21d64b710 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 29 Jan 2012 15:44:45 -0500 Subject: includecheck: delete any duplicate instances of module.h Different tree maintainers picked up independently generated trivial compile fixes based on linux-next testing, resulting in some cases where a file would have got more than one addition of module.h once everything was all merged together. Delete any duplicates so includecheck isn't complaining about anything related to module.h/export.h changes. Signed-off-by: Paul Gortmaker --- kernel/params.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..47f5bf12434a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include -- cgit From 05b4877f6a4f1ba4952d1222213d262bf8c132b7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 17 Feb 2012 23:39:51 +0100 Subject: PM / Hibernate: Enable usermodehelpers in hibernate() error path If create_basic_memory_bitmaps() fails, usermodehelpers are not re-enabled before returning. Fix this. And while at it, reword the goto labels so that they look more meaningful. Signed-off-by: Srivatsa S. Bhat Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 72baaf011fb7..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -618,7 +618,7 @@ int hibernate(void) /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Exit; + goto Enable_umh; printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); @@ -626,7 +626,7 @@ int hibernate(void) error = freeze_processes(); if (error) - goto Finish; + goto Free_bitmaps; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) @@ -659,8 +659,9 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - Finish: + Free_bitmaps: free_basic_memory_bitmaps(); + Enable_umh: usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); -- cgit From 37f08be11be9a7d9351fb1b9b408259519a126f3 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 21 Feb 2012 23:57:47 +0100 Subject: PM / Freezer: Remove references to TIF_FREEZE in comments This patch removes all the references in the code about the TIF_FREEZE flag removed by commit a3201227f803ad7fd43180c5195dbe5a2bf998aa freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE There still are some references to TIF_FREEZE in Documentation/power/freezing-of-tasks.txt, but it looks like that documentation needs more thorough work to reflect how the new freezer works, and hence merely removing the references to TIF_FREEZE won't really help. So I have not touched that part in this patch. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Signed-off-by: Rafael J. Wysocki --- kernel/exit.c | 2 +- kernel/freezer.c | 6 +++--- kernel/power/process.c | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170d..fd0af05e0639 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -424,7 +424,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. */ current->flags |= (PF_NOFREEZE | PF_KTHREAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/power/process.c b/kernel/power/process.c index 6aeb5efe00eb..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. 
*/ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) -- cgit From e06ffa1ede4146cbc261d90f5dff3d63fe2e9d7a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 9 Mar 2012 18:03:20 +0800 Subject: workqueue: use percpu allocator for cwq on UP Commit bbddff made the percpu allocator work on UP, so we no longer need the UP-only special case. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..5bbba094bfac 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -474,13 +474,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP + if (likely(cpu < nr_cpu_ids)) return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else - return wq->cpu_wq.single; -#endif - } } else if (likely(cpu == WORK_CPU_UNBOUND)) return wq->cpu_wq.single; return NULL; @@ -2897,13 +2892,8 @@ static int alloc_cwqs(struct workqueue_struct *wq) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) wq->cpu_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; @@ -2927,13 +2917,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) static void free_cwqs(struct workqueue_struct *wq) { -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_wq.pcpu); else if (wq->cpu_wq.single) { /* the pointer to free is stored right after the cwq */ -- cgit From 3047817b894ddae62be07787bc8735a616104398 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 9 Mar 2012 07:20:12 +0100 Subject: padata: Fix race in the serialization path When a padata object is queued to the serialization queue, another CPU might process and free the padata object. So don't dereference it after queueing to the serialization queue.
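The lifetime rule being enforced here generalizes beyond padata; as a hedged illustration, a minimal userspace sketch of the same pattern (pthreads, with invented names such as struct item and queue_item — this is not the kernel code): copy out whatever you still need before publishing the object, because a consumer may free it immediately afterwards.

#include <pthread.h>

struct item {
        int cb_cpu;                     /* consumer reads this, then frees the item */
        struct item *next;
};

static struct item *queue_head;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns the callback cpu without touching *it after it is published. */
static int queue_item(struct item *it)
{
        int cb_cpu = it->cb_cpu;        /* copy BEFORE publishing */

        pthread_mutex_lock(&queue_lock);
        it->next = queue_head;
        queue_head = it;                /* a consumer may free it from here on */
        pthread_mutex_unlock(&queue_lock);

        /* reading it->cb_cpu here would be a potential use-after-free */
        return cb_cpu;
}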
Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b45259931512..aa9929545855 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -230,6 +230,7 @@ out: static void padata_reorder(struct parallel_data *pd) { + int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; @@ -270,13 +271,14 @@ static void padata_reorder(struct parallel_data *pd) return; } - squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); + cb_cpu = padata->cb_cpu; + squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); + queue_work_on(cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); -- cgit From 2dc9b5dbdef09840de852a4f0cc6a9c9eece7220 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 9 Mar 2012 07:20:49 +0100 Subject: padata: Fix race on sequence number wrap When padata_do_parallel() is called from multiple CPUs for the same padata instance, we can get object reordering on sequence number wrap, because testing for sequence number wrap and resetting the sequence number must happen atomically but is implemented with two atomic operations. This patch fixes this by converting the sequence number from atomic_t to an unsigned int and protecting the access with a spin_lock. As a side effect, we get rid of the sequence number wrap handling, because the sequence number now wraps back to zero without the need to do anything. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index aa9929545855..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -29,7 +29,6 @@ #include #include -#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) @@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct padata_priv *padata) +static int padata_cpu_hash(struct parallel_data *pd) { int cpu_index; - struct parallel_data *pd; - - pd = padata->pd; /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use.
*/ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + spin_lock(&pd->seq_lock); + cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); + pd->seq_nr++; + spin_unlock(&pd->seq_lock); return padata_index_to_cpu(pd, cpu_index); } @@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = cb_cpu; - if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) - atomic_set(&pd->seq_nr, -1); - - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - - target_cpu = padata_cpu_hash(padata); + target_cpu = padata_cpu_hash(pd); queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; - int next_nr, next_index; + unsigned int next_nr, next_index; struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - if (unlikely(next_nr > pd->max_seq_nr)) { - next_nr = next_nr - pd->max_seq_nr - 1; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - pd->processed = 0; - } - padata = NULL; reorder = &next_queue->reorder; @@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) padata = list_entry(reorder->list.next, struct padata_priv, list); - BUG_ON(next_nr != padata->seq_nr); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); @@ -402,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, num_cpus, cpu; + int cpu_index, cpu; struct padata_parallel_queue *pqueue; cpu_index = 0; @@ -417,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd) INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } - - num_cpus = cpumask_weight(pd->cpumask.pcpu); - pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; } /* Allocate and initialize the internal cpumask dependend resources. 
*/ @@ -446,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = 0; atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; -- cgit From d762a50b5b1bb93e91cb3cd90b6ae133da98fe31 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: kdb: remove the second argument of k[un]map_atomic() Signed-off-by: Cong Wang --- kernel/debug/kdb/kdb_support.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if (!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); + vaddr = kmap_atomic(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); + kunmap_atomic(vaddr); return 0; } -- cgit From 0de9a1e28a0d005f42c8cc5456a246710133b9ab Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: power: remove the second argument of k[un]map_atomic() Acked-by: Rafael J. Wysocki Signed-off-by: Cong Wang --- kernel/power/snapshot.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..3a564ac85f36 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1000,20 +1000,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); + dst = kmap_atomic(d_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } @@ -1728,9 +1728,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2014,9 +2014,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); + dst = kmap_atomic(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -2309,13 +2309,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, KM_USER1); - kunmap_atomic(kaddr1, 
KM_USER0); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); } /** -- cgit From 5f8aadd8b9966d71a77bba52b9d499cc2f38269f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 14 Mar 2012 19:55:38 +0100 Subject: CLONE_PARENT shouldn't allow to set ->exit_signal The child must not control its ->exit_signal, it is the parent who decides which signal the child should use for notification. This means that CLONE_PARENT should not use "clone_flags & CSIGNAL", the forking task is the sibling of the new process and their parent doesn't control exit_signal in this case. This patch uses ->exit_signal of the forking process, but perhaps we should simply use SIGCHLD. We read group_leader->exit_signal lockless, this can race with the ORIGINAL_SIGNAL -> SIGCHLD transition, but this is fine. Potentially this change allows to kill self_exec_id/parent_exec_id. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 26a7a6707fa7..c4f38a849436 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1340,7 +1340,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->pdeath_signal = 0; p->exit_state = 0; -- cgit From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:22 +0100 Subject: exit_signal: simplify the "we have changed execution domain" logic exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id" to handle the "we have changed execution domain" case. We can change do_thread() to always set ->exit_signal = SIGCHLD and remove this check to simplify the code. We could change setup_new_exec() instead, this looks more logical because it increments ->self_exec_id. But note that de_thread() already resets ->exit_signal if it changes the leader, let's keep both changes close to each other. Note that we change ->exit_signal lockless, this changes the rules. Thereafter ->exit_signal is not stable under tasklist but this is fine, the only possible change is OLDSIG -> SIGCHLD. This can race with eligible_child() but the race is harmless. We can race with reparent_leader() which changes our ->exit_signal in parallel, but it does the same change to SIGCHLD. The noticeable user-visible change is that the execing task is not "visible" to do_wait()->eligible_child(__WCLONE) right after exec. To me this looks more logical, and this is consistent with mt case. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 752d2c0abd19..51ac4ced1313 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * If the parent exec id doesn't match the exec id we saved * when we started then we know the parent has changed security * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. 
*/ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) + tsk->parent_exec_id != tsk->real_parent->self_exec_id) tsk->exit_signal = SIGCHLD; if (unlikely(tsk->ptrace)) { -- cgit From b6e238dceed36891cc633167afe7151f1f3d83c5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:41 +0100 Subject: exit_signal: fix the "parent has changed security domain" logic exit_notify() changes ->exit_signal if the parent already did exec. This doesn't really work, we are not going to send the signal now if there is another live thread or the exiting task is traced. The parent can exec before the last dies or the tracer detaches. Move this check into do_notify_parent() which actually sends the signal. The user-visible change is that we do not change ->exit_signal, and thus the exiting task is still "clone children" for do_wait()->eligible_child(__WCLONE). Hopefully this is fine, the current logic is racy anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 -------------- kernel/signal.c | 9 +++++++++ 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 51ac4ced1313..ce5f758f40bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -818,20 +818,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - tsk->parent_exec_id != tsk->real_parent->self_exec_id) - tsk->exit_signal = SIGCHLD; - if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/signal.c b/kernel/signal.c index 8511e39813c7..e76001ccf5cd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + if (sig != SIGCHLD) { + /* + * This is only possible if parent == real_parent. + * Check if it has changed security domain. + */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* -- cgit From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..711c1a30ceaa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; -- cgit From 66b3fad3f4c535c92b6a1184d535a97d6aa5d82a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:48:20 -0400 Subject: constify path argument of audit_log_d_path() Signed-off-by: Al Viro --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index bb0eb5bb9a0a..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) + const struct path *path) { char *p, *pathname; -- cgit From 38eff2892628fa5c4fc8962a17b7296f42833ebe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:51:10 -0400 Subject: constify path argument of trace_seq_path() Signed-off-by: Al Viro --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..690987198ad7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; -- cgit From c3f0327f8e9d7a503f0d64573c311eddd61f197d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:33:48 -0700 Subject: mm: add rss counters consistency check Warn about non-zero rss counters at final mmdrop. This check will prevent reoccurences of bugs such as that fixed in "mm: fix rss count leakage during migration". I didn't hide this check under CONFIG_VM_DEBUG because it rather small and rss counters cover whole page-table management, so this is a good invariant. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c4f38a849436..a9e99f3c18e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -511,6 +511,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) return NULL; } +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif +} + /* * Allocate and initialize an mm_struct. 
*/ @@ -538,9 +555,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif + check_mm(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. 
Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 43 ++++++++----------------------------------- kernel/fork.c | 1 + 2 files changed, 9 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d575836dba6..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index a9e99f3c18e0..9cc227d54102 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1237,6 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -- cgit From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:13 -0700 Subject: mm, counters: remove task argument to sync_mm_rss() and __sync_task_rss_stat() sync_mm_rss() can only be used for current to avoid race conditions in iterating and clearing its per-task counters. Remove the task argument for it and its helper function, __sync_task_rss_stat(), to avoid thinking it can be used safely for anything other than current. 
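To illustrate the constraint (a hedged userspace analogue with invented names, not the kernel's mm code): the fast-path increments are unsynchronized and task-local, so only the owning thread can fold and clear them without racing.

#include <stdatomic.h>

#define NR_COUNTERS 4

static _Thread_local long local_count[NR_COUNTERS];     /* unsynchronized fast path */
static atomic_long global_count[NR_COUNTERS];

/* Each thread bumps its own slot cheaply... */
static void inc_counter(int idx)
{
        local_count[idx]++;
}

/* ...and only the owner may fold its slots into the global totals;
 * another thread doing this would race with the unlocked increments. */
static void sync_local_counters(void)
{
        for (int i = 0; i < NR_COUNTERS; i++) {
                if (local_count[i]) {
                        atomic_fetch_add(&global_count[i], local_count[i]);
                        local_count[i] = 0;
                }
        }
}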
Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 0ed15fed579f..d26acd3c1e2e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -934,7 +934,7 @@ void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); -- cgit From 42aee6c495e07dba7410b863a360db6bb9ec6d66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: cgroup: revert ss_id_lock to spinlock Commit c1e2ee2dc436 ("memcg: replace ss->id_lock with a rwlock") has now been seen to cause the unfair behavior we should have expected from converting a spinlock to an rwlock: softlockup in cgroup_mkdir(), whose get_new_cssid() is waiting for the wlock, while there are 19 tasks using the rlock in css_get_next() to get on with their memcg workload (in an artificial test, admittedly). Yet lib/idr.c was made suitable for RCU way back: revert that commit, restoring ss->id_lock to a spinlock. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c6877fe9a831..8eb90f25bd7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4885,9 +4885,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4913,10 +4913,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4931,9 +4931,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4945,7 +4945,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5040,9 +5040,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). 
*/ - read_lock(&ss->id_lock); + spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); if (!tmp) break; -- cgit From ca464d69b19120a826aa2534de2511a6f542edf5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: memcg: let css_get_next() rely upon rcu_read_lock() Remove lock and unlock around css_get_next()'s call to idr_get_next(). memcg iterators (only users of css_get_next) already did rcu_read_lock(), and its comment demands that; but add a WARN_ON_ONCE to make sure of it. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8eb90f25bd7b..391d5e991e5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5033,6 +5033,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5040,10 +5042,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { -- cgit From 9fbe465efc76044dd87afe764db5464ae61aeabc Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 16 Mar 2012 13:17:13 +0100 Subject: kgdb: Respect that flush op is optional Not all kgdb I/O drivers implement a flush operation. Adjust gdbstub_exit accordingly. Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/gdbstub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..5a155742ae96 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1129,5 +1129,6 @@ void gdbstub_exit(int status) dbg_io_ops->write_char(hex_asc_lo(checksum)); /* make sure the output is flushed, lest the bootloader clobber it */ - dbg_io_ops->flush(); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); } -- cgit From 2366e047840e33928803c0442176fb3991423da8 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 16 Mar 2012 14:20:41 -0500 Subject: kgdb,debug-core,gdbstub: Hook the reboot notifier for debugger detach The gdbstub and kdb should get detached if the system is rebooting. Calling gdbstub_exit() will set the proper debug core state and send a message to any debugger that is connected to correctly detach. An attached debugger will receive the exit code from include/linux/reboot.h based on SYS_HALT, SYS_REBOOT, etc... 
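For context, the reboot-notifier registration used below follows the standard kernel pattern; a minimal self-contained module sketch (the demo_* names are invented and not part of this patch):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int demo_reboot(struct notifier_block *nb, unsigned long code, void *unused)
{
        pr_info("reboot notifier: code=%lu\n", code);   /* SYS_RESTART, SYS_HALT, ... */
        return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
        .notifier_call = demo_reboot,
};

static int __init demo_init(void)
{
        return register_reboot_notifier(&demo_reboot_nb);
}

static void __exit demo_exit(void)
{
        unregister_reboot_notifier(&demo_reboot_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Unregistering on module exit mirrors what the patch does in kgdb_unregister_callbacks() above.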
Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 17 +++++++++++++++++ kernel/debug/gdbstub.c | 7 +++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..3c1ad4e03543 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -784,6 +785,20 @@ void __init dbg_late_init(void) kdb_init(KDB_INIT_FULL); } +static int +dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ + if (!dbg_kdb_mode) + gdbstub_exit(code); + return NOTIFY_DONE; +} + +static struct notifier_block dbg_reboot_notifier = { + .notifier_call = dbg_notify_reboot, + .next = NULL, + .priority = INT_MAX, +}; + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { @@ -791,6 +806,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ @@ -812,6 +828,7 @@ static void kgdb_unregister_callbacks(void) */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; + unregister_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 5a155742ae96..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1111,6 +1111,13 @@ void gdbstub_exit(int status) unsigned char checksum, ch, buffer[3]; int loop; + if (!kgdb_connected) + return; + kgdb_connected = 0; + + if (!dbg_io_ops || dbg_kdb_mode) + return; + buffer[0] = 'W'; buffer[1] = hex_asc_hi(status); buffer[2] = hex_asc_lo(status); -- cgit From 8f30d411767351656ea62c9e7612120f9b870b59 Mon Sep 17 00:00:00 2001 From: Andrei Warkentin Date: Tue, 28 Feb 2012 06:55:05 -0600 Subject: KDB: Fix usability issues relating to the 'enter' key. This fixes the following problems: 1) Typematic-repeat of 'enter' gives a warning message and leaks make/break codes if KDB exits. Repeats look something like 0x1c 0x1c .... 0x9c 2) Use of 'keypad enter' gives a warning message and leaks the ENTER break/make code out if KDB exits. KP ENTER repeats look something like 0xe0 0x1c 0xe0 0x1c ... 0xe0 0x9c. 3) Lag on the order of seconds between "break" and "make" when expecting the enter "break" code. Seen under virtualized environments such as VMware ESX. The existing special enter handler tries to glob the enter break code, but this fails if the other (KP) enter was used, or if there was a key repeat. It also fails if you mashed some keys along with enter, and you ended up with a non-enter make or non-enter break code coming after the enter make code. So first, we modify the handler to handle these cases. But performing these actions on every enter is annoying, since then you can't hold ENTER down to scroll messages in KDB. Since this special behaviour is only necessary to handle exiting KDB ('g' + ENTER) without leaking scancodes to the OS, this cleanup only needs to be executed when the kdb_main loop exits. Tested on QEMU. Set a bp on atkbd.c to verify no scan code was leaked.
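For readers without the AT keyboard protocol handy, a small userspace sketch of the decoding rules involved (an illustration only, not the kdb code): in scancode set 1, a break code is the make code with bit 7 set, and keypad ENTER hides behind the 0xe0 extended prefix.

#include <stdbool.h>
#include <stdint.h>

static bool saw_e0;     /* true if the previous byte was the 0xe0 prefix */

/* Returns true when @sc completes an ENTER or KP ENTER break code.
 * Plain ENTER is make 0x1c / break 0x9c; KP ENTER is the same bytes
 * behind an 0xe0 prefix, which is why both can be swallowed alike. */
static bool is_enter_break(uint8_t sc)
{
        if (sc == 0xe0) {
                saw_e0 = true;          /* extended prefix: KP ENTER may follow */
                return false;
        }
        saw_e0 = false;
        return sc == 0x9c;              /* 0x9c == 0x1c | 0x80 */
}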
Cc: Andrei Warkentin [jason.wessel@windriver.com: move cleanup calls to kdb_main.c] Signed-off-by: Andrei Warkentin Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_keyboard.c | 95 +++++++++++++++++++++++++++++++---------- kernel/debug/kdb/kdb_main.c | 3 ++ kernel/debug/kdb/kdb_private.h | 7 +++ 3 files changed, 83 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -25,6 +25,7 @@ #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ static int kbd_exists; +static int kbd_last_ret; /* * Check if the keyboard controller has a keypress for us. @@ -90,8 +91,11 @@ int kdb_get_kbd_char(void) return -1; } - if ((scancode & 0x80) != 0) + if ((scancode & 0x80) != 0) { + if (scancode == 0x9c) + kbd_last_ret = 0; return -1; + } scancode &= 0x7f; @@ -178,35 +182,82 @@ int kdb_get_kbd_char(void) return -1; /* ignore unprintables */ } - if ((scancode & 0x7f) == 0x1c) { - /* - * enter key. All done. Absorb the release scancode. - */ + if (scancode == 0x1c) { + kbd_last_ret = 1; + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); + +/* + * Best effort cleanup of ENTER break codes on leaving KDB. Called on + * exiting KDB, when we know we processed an ENTER or KP ENTER scan + * code. + */ +void kdb_kbd_cleanup_state(void) +{ + int scancode, scanstatus; + + /* + * Nothing to clean up, since either + * ENTER was never pressed, or has already + * gotten cleaned up. + */ + if (!kbd_last_ret) + return; + + kbd_last_ret = 0; + /* + * Enter key. Need to absorb the break code here, lest it gets + * leaked out if we exit KDB as the result of processing 'g'. + * + * This has several interesting implications: + * + Need to handle KP ENTER, which has break code 0xe0 0x9c. + * + Need to handle repeat ENTER and repeat KP ENTER. Repeats + * only get a break code at the end of the repeated + * sequence. This means we can't propagate the repeated key + * press, and must swallow it away. + * + Need to handle possible PS/2 mouse input. + * + Need to handle mashed keys. + */ + + while (1) { while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - ; + cpu_relax(); /* - * Fetch the scancode + * Fetch the scancode. */ scancode = inb(KBD_DATA_REG); scanstatus = inb(KBD_STATUS_REG); - while (scanstatus & KBD_STAT_MOUSE_OBF) { - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - } + /* + * Skip mouse input. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + continue; - if (scancode != 0x9c) { - /* - * Wasn't an enter-release, why not? - */ - kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", - scancode, scanstatus); - } + /* + * If we see 0xe0, this is either a break code for KP + * ENTER, or a repeat make for KP ENTER. Either way, + * since the second byte is equivalent to an ENTER, + * skip the 0xe0 and try again. + * + * If we see 0x1c, this must be a repeat ENTER or KP + * ENTER (and we swallowed 0xe0 before). Try again. + * + * We can also see make and break codes for other keys + * mashed before or after pressing ENTER. Thus, if we + * see anything other than 0x9c, we have to try again. + * + * Note, if you held some key as ENTER was depressed, + * that break code would get leaked out. 
+ */ + if (scancode != 0x9c) + continue; - return 13; + return; } - - return keychar & 0xff; } -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, if (KDB_STATE(DOING_SS)) KDB_STATE_CLEAR(SSBPT); + /* Clean up any keyboard devices before leaving */ + kdb_kbd_cleanup_state(); + return result; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -246,6 +246,13 @@ extern void debug_kusage(void); extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; + +#ifdef CONFIG_KDB_KEYBOARD +extern void kdb_kbd_cleanup_state(void); +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kdb_kbd_cleanup_state() +#endif /* ! CONFIG_KDB_KEYBOARD */ + #ifdef CONFIG_MODULES extern struct list_head *kdb_modules; #endif /* CONFIG_MODULES */ -- cgit From bec4d62ead8096e433d624d9339893f50badd992 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 19 Mar 2012 19:35:55 -0500 Subject: kgdb,debug_core: add the ability to control the reboot notifier Sometimes it is desirable to stop the kernel debugger before allowing a system to reboot either with kdb or kgdb. This patch adds the ability to turn the reboot notifier on and off or enter the debugger and stop kernel execution before rebooting. It is possible to change the setting after booting the kernel with the following: echo 1 > /sys/module/debug_core/parameters/kgdbreboot It is also possible to change this setting using kdb / kgdb to manipulate the variable directly. Using KDB: mm kgdbreboot 1 Using gdb: set kgdbreboot=1 Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3c1ad4e03543..3f88a45e6f0a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -76,6 +76,8 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); +/* Action for the reboot notifiter, a global allow kdb to change it */ +static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; /* determine if kgdb console output should be used */ @@ -97,6 +99,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); module_param(kgdb_use_con, int, 0644); +module_param(kgdbreboot, int, 0644); /* * Holds information about breakpoints in a kernel. These breakpoints are @@ -788,8 +791,21 @@ void __init dbg_late_init(void) static int dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { + /* + * Take the following action on reboot notify depending on value: + * 1 == Enter debugger + * 0 == [the default] detatch debug client + * -1 == Do nothing... 
and use this until the board resets + */ + switch (kgdbreboot) { + case 1: + kgdb_breakpoint(); + case -1: + goto done; + } if (!dbg_kdb_mode) gdbstub_exit(code); +done: return NOTIFY_DONE; } -- cgit From b8adde8ddec9ff62a21564fa8020b5463e70d4de Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Wed, 21 Sep 2011 13:19:12 -0700 Subject: kdb: Avoid using dbg_io_ops until it is initialized This fixes a bug with setting a breakpoint during kdb initialization (from kdb_cmds). Any call to kdb_printf() before the initialization of the kgdboc serial console driver (which happens much later during bootup than kdb_init), results in kernel panic due to the use of dbg_io_ops before it is initialized. Signed-off-by: Tim Bird Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -689,7 +689,7 @@ kdb_printit: if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(kdb_buffer, retlen); } else { - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(kdb_buffer); cp = kdb_buffer; while (len--) { -- cgit From 1ba0c1720eb0de2d0f3abf84c0b128d10af520d1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Sep 2011 13:07:47 -0700 Subject: kdb: Add message about CONFIG_DEBUG_RODATA on failure to install breakpoint On x86, if CONFIG_DEBUG_RODATA is set, one cannot set breakpoints via KDB. Apparently this is a well-known problem, as at least one distribution now ships with both KDB enabled and CONFIG_DEBUG_RODATA=y for security reasons. This patch adds an printk message to the breakpoint failure case, in order to provide suggestions about how to use the debugger. Reported-by: Tim Bird Signed-off-by: Jason Wessel Acked-by: Tim Bird --- kernel/debug/kdb/kdb_bp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); +#ifdef CONFIG_DEBUG_RODATA + if (!bp->bp_type) { + kdb_printf("Software breakpoints are unavailable.\n" + " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " OR use hw breaks: help bph\n"); + } +#endif return 1; } return 0; -- cgit From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision Userspace service managers/supervisors need to track their started services. Many services daemonize by double-forking and get implicitly re-parented to PID 1. The service manager will no longer be able to receive the SIGCHLD signals for them, and is no longer in charge of reaping the children with wait(). All information about the children is lost at the moment PID 1 cleans up the re-parented processes. With this prctl, a service manager process can mark itself as a sort of 'sub-init', able to stay as the parent for all orphaned processes created by the started services. All SIGCHLD signals will be delivered to the service manager. 
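(As a hedged sketch of the intended usage, not part of this patch: a service manager marks itself as a subreaper, lets a service double-fork, and can still reap the orphaned grandchild. The fallback #define is only needed with pre-3.4 headers.)

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif

int main(void)
{
        if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
                perror("prctl");
                return 1;
        }

        pid_t pid = fork();
        if (pid == 0) {                 /* intermediate child */
                if (fork() == 0) {      /* the "daemon", orphaned below */
                        sleep(1);
                        _exit(42);
                }
                _exit(0);               /* orphan the grandchild */
        }

        /* Reap both: the intermediate child, then the re-parented daemon. */
        int status;
        pid_t w;
        while ((w = wait(&status)) > 0)
                printf("reaped %d, status %d\n", w, WEXITSTATUS(status));
        return 0;
}

Without the prctl, the grandchild would be re-parented to PID 1 and the final wait() would return nothing to reap.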
Receiving SIGCHLD and doing wait() is in cases of a service-manager much preferred over any possible asynchronous notification about specific PIDs, because the service manager has full access to the child process data in /proc and the PID can not be re-used until the wait(), the service-manager itself is in charge of, has happened. As a side effect, the relevant parent PID information does not get lost by a double-fork, which results in a more elaborate process tree and 'ps' output: before: # ps afx 253 ? Ss 0:00 /bin/dbus-daemon --system --nofork 294 ? Sl 0:00 /usr/libexec/polkit-1/polkitd 328 ? S 0:00 /usr/sbin/modem-manager 608 ? Sl 0:00 /usr/libexec/colord 658 ? Sl 0:00 /usr/libexec/upowerd 819 ? Sl 0:00 /usr/libexec/imsettings-daemon 916 ? Sl 0:00 /usr/libexec/udisks-daemon 917 ? S 0:00 \_ udisks-daemon: not polling any devices after: # ps afx 294 ? Ss 0:00 /bin/dbus-daemon --system --nofork 426 ? Sl 0:00 \_ /usr/libexec/polkit-1/polkitd 449 ? S 0:00 \_ /usr/sbin/modem-manager 635 ? Sl 0:00 \_ /usr/libexec/colord 705 ? Sl 0:00 \_ /usr/libexec/upowerd 959 ? Sl 0:00 \_ /usr/libexec/udisks-daemon 960 ? S 0:00 | \_ udisks-daemon: not polling any devices 977 ? Sl 0:00 \_ /usr/libexec/packagekitd This prctl is orthogonal to PID namespaces. PID namespaces are isolated from each other, while a service management process usually requires the services to live in the same namespace, to be able to talk to each other. Users of this will be the systemd per-user instance, which provides init-like functionality for the user's login session and D-Bus, which activates bus services on-demand. Both need init-like capabilities to be able to properly keep track of the services they start. Many thanks to Oleg for several rounds of review and insights. [akpm@linux-foundation.org: fix comment layout and spelling] [akpm@linux-foundation.org: add lengthy code comment from Oleg] Reviewed-by: Oleg Nesterov Signed-off-by: Lennart Poettering Signed-off-by: Kay Sievers Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 33 ++++++++++++++++++++++++++++----- kernel/fork.c | 3 +++ kernel/sys.c | 8 ++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 16b07bfac224..456329fd4ea3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). 
This is what we need to DTRT in a + * PID namespace. However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; diff --git a/kernel/fork.c b/kernel/fork.c index 37674ec55cde..b9372a0bff18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd195..9eb7fcab8df6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; -- cgit From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: kernel/exit.c: if init dies, log a signal which killed it, if any I just received another user's pleas for help when their init mysteriously died. I again explained that they need to check whether it died because of bad instruction, a segv, or something else. Which was an annoying detour into writing a trivial C program to spawn his init and print its exit code: http://lists.busybox.net/pipermail/busybox/2012-January/077172.html I hear you saying "just test it under /bin/sh". Well, the crashing init _was_ /bin/sh. Which prompted me to make kernel do this first step automatically. We can print exit code, which makes it possible to see that death was from e.g. SIGILL without writing test programs. [akpm@linux-foundation.org: add 0x to hex number output] Signed-off-by: Denys Vlasenko Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 456329fd4ea3..3db1909faed9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! 
exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); -- cgit From 7a05c0f7bbae91d08b7d0acf016fdb42dbc912ae Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: watchdog: make sure the watchdog thread gets CPU on loaded system If the system is loaded while hotplugging a CPU, we might end up with a bogus hardlockup detection. This has been seen during an LTP pounder test executed in parallel with a hotplug test. The main problem is that watchdog_enable() (called when a CPU is brought up) registers a perf event which periodically checks the per-cpu counter (hrtimer_interrupts), updated from a hrtimer callback, but the hrtimer is fired from the kernel thread. This means that while we are already checking for a hard lockup, the kernel thread might be sitting on the runqueue behind zillions of tasks, so there is nobody to update the value we rely on and so we KABOOM. Let's fix this by boosting the watchdog thread priority before we wake it up rather than when it's already running. This still doesn't handle the case where we have the same number of high-prio FIFO tasks, but that doesn't seem to be common. The current implementation doesn't handle that case anyway, so this is not worse at least. Unfortunately, we cannot start the perf counter from the watchdog thread because we could miss a real lockup, and we also cannot start the hrtimer from watchdog_enable() because there is no way (at least none I know of) to start a hrtimer on a different CPU. [dzickus@redhat.com: fix compile issue with param] Cc: Ingo Molnar Cc: Peter Zijlstra Reviewed-by: Mandeep Singh Baines Signed-off-by: Michal Hocko Signed-off-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14bc092fb12c..203fc6e1a285 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -319,11 +319,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - sched_setscheduler(current, SCHED_FIFO, &param); - /* initialize timestamp */ __touch_watchdog(); @@ -350,7 +348,6 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - param.sched_priority = 0; sched_setscheduler(current, SCHED_NORMAL, &param); return 0; } @@ -439,6 +436,7 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); @@ -450,6 +448,7 @@ static int watchdog_enable(int cpu) } goto out; } + sched_setscheduler(p, SCHED_FIFO, &param); kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; -- cgit From 4501980aae221ed8120dee3491f799ecd75187ad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: kernel/watchdog.c: convert to pr_foo() It fixes some 80-col wordwrappings and adds some consistency.
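The pr_fmt()/pr_foo() convention adopted here is worth seeing in isolation; a minimal module sketch (the mydrv names are invented): defining pr_fmt before any include gives every pr_*() call the prefix for free.

#define pr_fmt(fmt) "mydrv: " fmt      /* must precede the includes */

#include <linux/kernel.h>
#include <linux/module.h>

static int __init mydrv_init(void)
{
        pr_info("loaded\n");            /* logs as "mydrv: loaded" */
        pr_warn("example warning\n");   /* same prefix, no repetition */
        return 0;
}

static void __exit mydrv_exit(void)
{
        pr_info("unloaded\n");
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");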
Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 203fc6e1a285..a01cb03b045a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,6 +9,8 @@ * to those contributors as well. */ +#define pr_fmt(fmt) "NMI watchdog: " fmt + #include #include #include @@ -373,18 +375,20 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + pr_info("enabled, takes one hw-pmu counter.\n"); goto out_save; } /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + pr_warning("disabled (cpu%i): hardware events not enabled\n", + cpu); else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -439,7 +443,7 @@ static int watchdog_enable(int cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + pr_err("softlockup watchdog for %i failed\n", cpu); if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); @@ -495,7 +499,7 @@ static void watchdog_enable_all_cpus(void) watchdog_enabled = 1; if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + pr_err("failed to be enabled on some cpus\n"); } -- cgit From b60f796c4ca72545327a069f12938360d833cce7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:56 -0700 Subject: kernel/watchdog.c: add comment to watchdog() exit path Revelation from Peter. Cc: Peter Zijlstra Cc: Don Zickus Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a01cb03b045a..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -349,6 +349,10 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } + /* + * Drop the policy/priority elevation during thread exit to avoid a + * scheduling latency spike. + */ __set_current_state(TASK_RUNNING); sched_setscheduler(current, SCHED_NORMAL, &param); return 0; -- cgit From 8c5cf9e5c50dc902713897e10201aa71f3546aa1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:40 -0700 Subject: ptrace: don't modify flags on PTRACE_SETOPTIONS failure On ptrace(PTRACE_SETOPTIONS, pid, 0, <opts>), we used to set those option bits which are known, and then fail with -EINVAL if there are some unknown bits in <opts>. This is inconsistent with typical error handling, which does not change any state if input is invalid.
This patch changes PTRACE_SETOPTIONS behavior so that in this case, we return -EINVAL and don't change any bits in task->ptrace. It's very unlikely that there is userspace code in the wild which will be affected by this change: it should have the form ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_BOGUSOPT) where PTRACE_O_BOGUSOPT is a constant unknown to the kernel. But kernel headers, naturally, don't contain any PTRACE_O_BOGUSOPTs, thus the only way userspace can use one is if it defines one itself. I can't see why anyone would do such a thing deliberately. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..273f56ea39d2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -528,6 +528,9 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + child->ptrace &= ~PT_TRACE_MASK; if (data & PTRACE_O_TRACESYSGOOD) child->ptrace |= PT_TRACESYSGOOD; @@ -551,7 +554,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & PTRACE_O_TRACEEXIT) child->ptrace |= PT_TRACE_EXIT; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -- cgit
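The pattern being applied here is the classic "validate the whole input before touching any state". An editor's sketch of the before/after shape in generic C (illustrative only, not kernel code):

#include <errno.h>

#define KNOWN_FLAGS 0x0007u

static unsigned int state;

/* Before: state is modified first and the error reported afterwards,
 * so a failing call still leaves the partial update behind. */
static int set_flags_old(unsigned int flags)
{
	state = flags & KNOWN_FLAGS;
	return (flags & ~KNOWN_FLAGS) ? -EINVAL : 0;
}

/* After: unknown bits are rejected up front; state changes only on
 * success. */
static int set_flags_new(unsigned int flags)
{
	if (flags & ~KNOWN_FLAGS)
		return -EINVAL;
	state = flags;
	return 0;
}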
From 86b6c1f301faf085de5a3f9ce16b8de6e69c729b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:41 -0700 Subject: ptrace: simplify PTRACE_foo constants and PTRACE_SETOPTIONS code Exchange PT_TRACESYSGOOD and PT_PTRACE_CAP bit positions, which makes PT_option bits contiguous and therefore makes code in ptrace_setoptions() much simpler. Every PTRACE_O_TRACEevent is defined to (1 << PTRACE_EVENT_event) instead of using explicit numeric constants, to ensure we don't mess up the relationship between bit positions and event ids. PT_EVENT_FLAG_SHIFT was not particularly useful; PT_OPT_FLAG_SHIFT with a value of PT_EVENT_FLAG_SHIFT-1 is easier to use. The PT_TRACE_MASK constant is nuked; its only use is replaced by (PTRACE_O_MASK << PT_OPT_FLAG_SHIFT). Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 31 ++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 273f56ea39d2..9acd07a6f5bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -262,7 +262,7 @@ static int ptrace_attach(struct task_struct *task, long request, /* * Protect exec's credential calculations against our interference; - * interference; SUID, SGID and LSM creds get determined differently + * SUID, SGID and LSM creds get determined differently under ptrace. */ retval = -ERESTARTNOINTR; @@ -528,31 +528,16 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + unsigned flags; + if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; - child->ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; return 0; } -- cgit
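The payoff of making the option bits contiguous is that the per-option if() chain collapses into a single shift. An editor's sketch with made-up values (the real constants are in include/linux/ptrace.h):

#include <stdio.h>

enum { EVENT_FORK = 1, EVENT_VFORK = 2, EVENT_CLONE = 3 };

#define O_TRACEFORK	(1u << EVENT_FORK)	/* mirrors 1 << PTRACE_EVENT_event */
#define O_TRACEVFORK	(1u << EVENT_VFORK)
#define O_TRACECLONE	(1u << EVENT_CLONE)
#define O_MASK		(O_TRACEFORK | O_TRACEVFORK | O_TRACECLONE)

#define OPT_FLAG_SHIFT	3	/* where the PT_ copies of the bits live */

int main(void)
{
	unsigned int data = O_TRACEFORK | O_TRACECLONE;

	/* one shift replaces the whole chain of if () statements */
	unsigned int pt_bits = (data & O_MASK) << OPT_FLAG_SHIFT;

	printf("data=%#x -> pt_bits=%#x\n", data, pt_bits);
	return 0;
}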
+ * Once SEIZE behaviors are completely implemented, this flag + * will be removed. */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (!(flags & PTRACE_SEIZE_DEVEL)) + goto out; + flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -282,11 +293,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -879,7 +890,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1022,7 +1033,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. -- cgit From ee00560c7dac1dbbf048446a8489550d0a5765b7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:43 -0700 Subject: ptrace: remove PTRACE_SEIZE_DEVEL bit PTRACE_SEIZE code is tested and ready for production use, remove the code which requires special bit in data argument to make PTRACE_SEIZE work. Strace team prepares for a new release of strace, and we would like to ship the code which uses PTRACE_SEIZE, preferably after this change goes into released kernel. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4661c5bc07e5..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,25 +237,10 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL bit is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag - * will be removed. 
- */ retval = -EIO; if (seize) { if (addr != 0) goto out; - if (!(flags & PTRACE_SEIZE_DEVEL)) - goto out; - flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); -- cgit From 629d362b9950166c6fac2aa8425db34d824bb043 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:44 -0700 Subject: signal: give SEND_SIG_FORCED more power to beat SIGNAL_UNKILLABLE force_sig_info() and friends have special semantics for synchronous signals; this interface should not be used if the target is not current. And it needs fixes; in particular, the clearing of SIGNAL_UNKILLABLE is not exactly right. However there are callers which have to use force_ exactly because it clears SIGNAL_UNKILLABLE and thus it can kill the CLONE_NEWPID tasks, although this is almost always wrong for various reasons. With this patch SEND_SIG_FORCED ignores SIGNAL_UNKILLABLE, like we do if the signal comes from the ancestor namespace. This makes the naming in prepare_signal() paths insane, fixed by the next cleanup. Note: this only affects SIGKILL/SIGSTOP, but this is enough for force_sig() abusers. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..2584f5a91fbe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; -- cgit From def8cf72562e17ec8316ce0cb5697c7afd6400e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:45 -0700 Subject: signal: cosmetic, s/from_ancestor_ns/force/ in prepare_signal() paths Cosmetic, rename the from_ancestor_ns argument in prepare_signal() paths. After the previous change it doesn't match reality. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W.
Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2584f5a91fbe..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1602,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; -- cgit From a02d6fd643cbd4c559113b35b31d3b04e4ec60c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:46 -0700 Subject: signal: zap_pid_ns_processes: s/SEND_SIG_NOINFO/SEND_SIG_FORCED/ Change zap_pid_ns_processes() to use SEND_SIG_FORCED, it looks more clear compared to SEND_SIG_NOINFO which relies on from_ancestor_ns logic send_signal(). It is also more efficient if we need to kill a lot of tasks because it doesn't alloc sigqueue. While at it, add the __fatal_signal_pending(task) check as a minor optimization. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). 
- */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); -- cgit From b3449922502f5a161ee2b5022a33aec8472fbf18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: introduce umh_complete(sub_info) Preparation. Add the new trivial helper, umh_complete(). Currently it simply does complete(sub_info->complete). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..8ea25944ce33 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -199,6 +199,11 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + complete(sub_info->complete); +} + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -235,7 +240,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -269,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } -- cgit From d0bd587a80960d7ba7e0c8396e154028c9045c54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: implement UMH_KILLABLE Implement UMH_KILLABLE, which should be used along with UMH_WAIT_EXEC/PROC. The caller must ensure that subprocess_info->path/etc can not go away until call_usermodehelper_freeinfo(). call_usermodehelper_exec(UMH_KILLABLE) does wait_for_completion_killable. If it fails, it uses xchg(&sub_info->complete, NULL) to serialize with umh_complete() which does the same xchg() to access sub_info->complete. If call_usermodehelper_exec wins, it can safely return. umh_complete() should get NULL and call call_usermodehelper_freeinfo(). Otherwise we know that umh_complete() was already called; in this case call_usermodehelper_exec() falls back to wait_for_completion() which should succeed "very soon". Note: UMH_NO_WAIT == -1 but it obviously should not be used with UMH_KILLABLE. We delay the necessary cleanup to simplify the backporting. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8ea25944ce33..f92f917c450c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -201,7 +201,15 @@ EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { - complete(sub_info->complete); + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); } /* Keventd can't block, but this (a child) can.
*/ @@ -252,6 +260,9 @@ static void __call_usermodehelper(struct work_struct *work) enum umh_wait wait = sub_info->wait; pid_t pid; + if (wait != UMH_NO_WAIT) + wait &= ~UMH_KILLABLE; + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -461,9 +472,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: -- cgit
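The xchg() hand-off above is worth seeing in isolation: whichever side wins the atomic exchange to NULL takes over responsibility for the object. An editor's userspace analog in C11 (illustrative only; the kernel uses xchg() and struct completion):

#include <stdatomic.h>
#include <stdlib.h>

struct req {
	_Atomic(int *) done;	/* stands in for sub_info->complete */
};

static void completer(struct req *r)
{
	int *d = atomic_exchange(&r->done, NULL);

	if (d)
		*d = 1;		/* waiter is still there: signal it */
	else
		free(r);	/* waiter was killed and detached: we free */
}

static void killed_waiter(struct req *r)
{
	if (atomic_exchange(&r->done, NULL)) {
		/* we won the race: completer will see NULL and free r */
		return;
	}
	/* completer already ran and signalled; we keep ownership */
	free(r);
}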
From 9d944ef32e83405a07376f112e9f02161d3e9731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:48 -0700 Subject: usermodehelper: kill umh_wait, renumber UMH_* constants No functional changes. It is not sane to use UMH_KILLABLE with enum umh_wait, but obviously we do not want another argument in call_usermodehelper_* helpers. Kill this enum, use the plain int. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index f92f917c450c..8341de91613f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -257,12 +257,9 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - if (wait != UMH_NO_WAIT) - wait &= ~UMH_KILLABLE; - /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -451,8 +448,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; -- cgit From 5b9bd473e3b8a8c6c4ae99be475e6e9b27568555 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: usermodehelper: ____call_usermodehelper() doesn't need do_exit() Minor cleanup. ____call_usermodehelper() can simply return; there is no need to call do_exit() explicitly. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8341de91613f..685b246b13b0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -188,7 +188,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit From 3e63a93b987685f02421e18b2aa452d20553a88b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: kmod: introduce call_modprobe() helper No functional changes. Move the call_usermodehelper code from __request_module() into the new simple helper, call_modprobe(). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 685b246b13b0..56a29e812ff0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,21 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait, NULL, NULL, NULL); +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +96,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +138,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; -- cgit From 1cc684ab75123efe7ff446eb821d44375ba8fa30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:50 -0700 Subject: kmod: make __request_module() killable As Tetsuo Handa pointed out, request_module() can stress the system while the oom-killed caller sleeps in TASK_UNINTERRUPTIBLE. The task T uses "almost all" memory, then it does something which triggers request_module(). Say, it can simply call sys_socket(). This in turn needs more memory and leads to OOM. oom-killer correctly chooses T and kills it, but this can't help because it sleeps in TASK_UNINTERRUPTIBLE and after that oom-killer becomes "disabled" by the TIF_MEMDIE task T. Make __request_module() killable. The only necessary change is that call_modprobe() should kmalloc argv and module_name; they can't live on the stack if we use UMH_KILLABLE. This memory is freed via call_usermodehelper_freeinfo()->cleanup.
Reported-by: Tetsuo Handa Signed-off-by: Oleg Nesterov Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 56a29e812ff0..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,12 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + static int call_modprobe(char *module_name, int wait) { static char *envp[] = { @@ -69,10 +75,26 @@ static int call_modprobe(char *module_name, int wait) NULL }; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; return call_usermodehelper_fns(modprobe_path, argv, envp, - wait, NULL, NULL, NULL); + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; } /** -- cgit
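To close the section, here is the same heap-argv pattern in plain userspace C (editor's sketch with hypothetical names): once the caller may return before the helper has run - the UMH_KILLABLE case - argv can no longer live in the caller's stack frame, so it is heap-allocated and released by a cleanup callback tied to the request:

#include <stdlib.h>
#include <string.h>

struct helper_req {
	char **argv;
	void (*cleanup)(struct helper_req *req);
};

static void free_argv(struct helper_req *req)
{
	free(req->argv[3]);	/* the strdup'ed module name */
	free(req->argv);
}

static struct helper_req *make_modprobe_req(const char *module_name)
{
	struct helper_req *req = malloc(sizeof(*req));
	char **argv = malloc(5 * sizeof(char *));
	char *name = strdup(module_name);

	if (!req || !argv || !name) {
		free(req);
		free(argv);
		free(name);
		return NULL;
	}
	argv[0] = "/sbin/modprobe";
	argv[1] = "-q";
	argv[2] = "--";
	argv[3] = name;	/* freed by free_argv(), as in the patch */
	argv[4] = NULL;
	req->argv = argv;
	req->cleanup = free_argv;
	return req;
}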