workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity
Unbound workqueues are now NUMA aware.  Let's add some control knobs
and update the sysfs interface accordingly.

* Add kernel param workqueue.disable_numa which disables NUMA affinity
  globally.

* Replace sysfs file "pool_id" with "pool_ids" which contains
  node:pool_id pairs.  This change is userland-visible but "pool_id"
  hasn't seen a release yet, so this is okay.

* Add a new sysfs file "numa" which can toggle NUMA affinity on
  individual workqueues.  This is implemented as attrs->no_numa, which
  is special in that it isn't part of a pool's attributes.  It only
  affects how apply_workqueue_attrs() picks which pools to use.

After the "pool_ids" change, first_pwq() doesn't have any user left.
Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
This commit is contained in:
parent
4c16bd327c
commit
d55262c4d1
3 changed files with 73 additions and 23 deletions
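
As an illustration of the resulting interface, here is a minimal
userspace sketch; it assumes a WQ_SYSFS workqueue exposed under
/sys/bus/workqueue/ ("writeback" is used only as an example device
name, and the printed pool IDs are hypothetical):

	/* Minimal userspace sketch (assumes a WQ_SYSFS workqueue named
	 * "writeback"; run as root).  Reads the new "pool_ids" file, then
	 * disables NUMA affinity for that workqueue via the new "numa"
	 * file. */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f;

		f = fopen("/sys/bus/workqueue/devices/writeback/pool_ids", "r");
		if (f) {
			if (fgets(line, sizeof(line), f))
				printf("pool_ids: %s", line);	/* e.g. "0:4 1:6" */
			fclose(f);
		}

		f = fopen("/sys/bus/workqueue/devices/writeback/numa", "w");
		if (f) {
			fputs("0\n", f);	/* sets attrs->no_numa = true */
			fclose(f);
		}
		return 0;
	}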
Documentation/kernel-parameters.txt
@@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			or other driver-specific files in the
 			Documentation/watchdog/ directory.
 
+	workqueue.disable_numa
+			By default, all work items queued to unbound
+			workqueues are affine to the NUMA nodes they're
+			issued on, which results in better behavior in
+			general. If NUMA affinity needs to be disabled for
+			whatever reason, this option can be used. Note
+			that this also can be controlled per-workqueue for
+			workqueues visible under /sys/bus/workqueue/.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
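For example, booting with "workqueue.disable_numa=1" on the kernel
command line disables the NUMA affinity machinery system-wide; the
parameter is read-only at runtime (mode 0444), and wq_numa_init()
later in this patch checks it before allocating any per-node state.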
include/linux/workqueue.h
@@ -119,10 +119,15 @@ struct delayed_work {
 /*
  * A struct for workqueue attributes. This can be used to change
  * attributes of an unbound workqueue.
+ *
+ * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
+ * only modifies how apply_workqueue_attrs() select pools and thus doesn't
+ * participate in pool hash calculations or equality comparisons.
  */
 struct workqueue_attrs {
 	int			nice;		/* nice level */
 	cpumask_var_t		cpumask;	/* allowed CPUs */
+	bool			no_numa;	/* disable NUMA affinity */
 };
 
 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
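For kernel-side users, a hypothetical built-in initcall sketch (not
part of this commit; the names example_wq/example_init are made up)
showing how the new field composes with the existing attrs API:

	/* Hypothetical built-in sketch: allocate an unbound workqueue and
	 * opt it out of NUMA affinity through the new ->no_numa field. */
	#include <linux/init.h>
	#include <linux/gfp.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *example_wq;

	static int __init example_init(void)
	{
		struct workqueue_attrs *attrs;
		int ret;

		example_wq = alloc_workqueue("example_wq", WQ_UNBOUND, 0);
		if (!example_wq)
			return -ENOMEM;

		attrs = alloc_workqueue_attrs(GFP_KERNEL);
		if (!attrs) {
			destroy_workqueue(example_wq);
			return -ENOMEM;
		}

		/* alloc_workqueue_attrs() starts from default nice/cpumask */
		attrs->no_numa = true;	/* every node will use the default pwq */

		ret = apply_workqueue_attrs(example_wq, attrs);
		free_workqueue_attrs(attrs);
		if (ret)
			destroy_workqueue(example_wq);
		return ret;
	}
	late_initcall(example_init);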
kernel/workqueue.c
@@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
 static cpumask_var_t *wq_numa_possible_cpumask;
 					/* possible CPUs of each node */
 
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
 static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
 
 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
 	return ret;
 }
 
-/**
- * first_pwq - return the first pool_workqueue of the specified workqueue
- * @wq: the target workqueue
- *
- * This must be called either with wq->mutex held or sched RCU read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- */
-static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
-{
-	assert_rcu_or_wq_mutex(wq);
-	return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
-				      pwqs_node);
-}
-
 /**
  * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
  * @wq: the target workqueue
@@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = {
 	__ATTR_NULL,
 };
 
-static ssize_t wq_pool_id_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct worker_pool *pool;
-	int written;
+	const char *delim = "";
+	int node, written = 0;
 
 	rcu_read_lock_sched();
-	pool = first_pwq(wq)->pool;
-	written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
+	for_each_node(node) {
+		written += scnprintf(buf + written, PAGE_SIZE - written,
+				     "%s%d:%d", delim, node,
+				     unbound_pwq_by_node(wq, node)->pool->id);
+		delim = " ";
+	}
+	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
 	rcu_read_unlock_sched();
 
 	return written;
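With this change, reading "pool_ids" on, say, a two-node machine
prints one node:pool_id pair per node on a single line, e.g. "0:4 1:6"
(the IDs here are hypothetical), whereas the old "pool_id" file
printed the single pool ID of the first pwq.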
@@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev,
 	return ret ?: count;
 }
 
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n",
+			    !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int v, ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	if (sscanf(buf, "%d", &v) == 1) {
+		attrs->no_numa = !v;
+		ret = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
 static struct device_attribute wq_sysfs_unbound_attrs[] = {
-	__ATTR(pool_id, 0444, wq_pool_id_show, NULL),
+	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
 	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
 	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
 	__ATTR_NULL,
 };
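Note that wq_numa_store() funnels the toggle through
wq_sysfs_prep_attrs() and apply_workqueue_attrs(), the same path that
"nice" and "cpumask" writes take, so flipping "numa" rebuilds the
workqueue's pwqs like any other attribute change. Because ->no_numa
doesn't participate in pool hash calculations or equality comparisons,
the underlying pools can be shared either way.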
@@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
 				 int cpu_going_down, cpumask_t *cpumask)
 {
-	if (!wq_numa_enabled)
+	if (!wq_numa_enabled || attrs->no_numa)
 		goto use_dfl;
 
 	/* does @node have any online CPUs @attrs wants? */
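With ->no_numa set, wq_calc_node_cpumask() always takes the use_dfl
path, so every node falls back to the workqueue's default pwq, and the
CPU-hotplug update path below becomes a no-op for that workqueue.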
@@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	cpumask = target_attrs->cpumask;
 
 	mutex_lock(&wq->mutex);
+	if (wq->unbound_attrs->no_numa)
+		goto out_unlock;
 
 	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
 	pwq = unbound_pwq_by_node(wq, node);
@@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void)
 	if (num_possible_nodes() <= 1)
 		return;
 
+	if (wq_disable_numa) {
+		pr_info("workqueue: NUMA affinity support disabled\n");
+		return;
+	}
+
 	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
 	BUG_ON(!wq_update_unbound_numa_attrs_buf);
 