[RFC] cgroup: Fix /proc/cgroups count for v2

From: T.J. Mercier
Date: Sun May 19 2024 - 13:47:19 EST


The /proc/cgroups documentation says that the num_cgroups value is,
"the number of control groups in this hierarchy using this controller."

The value printed is simply the total number of cgroups in the hierarchy
which is correct for v1, but not for the shared v2 hierarchy.

Consider:
controllers="cpuset cpu io memory hugetlb pids rdma misc"
for c in $controllers
do
echo +$c > /sys/fs/cgroup/cgroup.subtree_control
mkdir /sys/fs/cgroup/$c
echo +$c > /sys/fs/cgroup/$c/cgroup.subtree_control
for i in `seq 100`; do mkdir /sys/fs/cgroup/$c/$i; done
done
cat /proc/cgroups

#subsys_name hierarchy num_cgroups enabled
cpuset 0 809 1
cpu 0 809 1
cpuacct 0 809 1
blkio 0 809 1
memory 0 809 1
devices 0 809 1
freezer 0 809 1
net_cls 0 809 1
perf_event 0 809 1
net_prio 0 809 1
hugetlb 0 809 1
pids 0 809 1
rdma 0 809 1
misc 0 809 1
debug 0 809 1

A count of 809 is reported for each controller, but only 109 should be
reported for most of them since each controller is enabled in only part
of the hierarchy. (Note that io depends on memcg, so its count should be
209.)

The number of cgroups using a controller is an important metric since
kernel memory is used for each cgroup, and some kernel operations scale
with the number of cgroups for some controllers (memory, io). So users
have an interest in minimizing/tracking the number of them.

- - - - - - - - - -

Why is this functional patch currently a RFC:
The point at which the new counters are incremented/decremented for most
enumerated v2 controllers works fine. However for some controllers (the
v2 documentation calls them "utility controllers") online_css and
kill_css are never called: cpuacct, devices, freezer, net_cls, net_prio,
debug.

To deal with num_cgroups being reported as 1 for those utility
controllers regardless of the number of cgroups that exist and support
their use, I added is_v2_utility_controller which checks if a controller
is among a hardcoded list instead of looking at some property of the
cgroup_subsys since I don't think any such property currently exists.
It'd be easy to miss adding a new utility controller to this list, so
I am interested in hearing if folks have other ideas. I checked if I
could use the presence of online_css in cgroup_subsys, but that only
works for cpuacct and debug.
---
include/linux/cgroup-defs.h | 6 ++++++
kernel/cgroup/cgroup-v1.c | 36 ++++++++++++++++++++++++++++++++++--
kernel/cgroup/cgroup.c | 4 ++++
3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ea48c861cd36..400311222337 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -579,6 +579,12 @@ struct cgroup_root {
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
atomic_t nr_cgrps;

+ /*
+ * Number of cgroups using each controller. Includes online and zombies.
+ * Used only for v2 controllers in /proc/cgroups.
+ */
+ atomic_t nr_css[CGROUP_SUBSYS_COUNT];
+
/* Hierarchy-specific flags */
unsigned int flags;

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 520a11cb12f4..8146bcc31421 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -663,6 +663,30 @@ struct cftype cgroup1_base_files[] = {
{ } /* terminate */
};

+static bool is_v2_utility_controller(int ssid)
+{
+ return
+#ifdef CONFIG_CGROUP_CPUACCT
+ ssid == cpuacct_cgrp_id ||
+#endif
+#ifdef CONFIG_CGROUP_DEVICE
+ ssid == devices_cgrp_id ||
+#endif
+#ifdef CONFIG_CGROUP_FREEZER
+ ssid == freezer_cgrp_id ||
+#endif
+#ifdef CONFIG_NET_CLS_CGROUP
+ ssid == net_cls_cgrp_id ||
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ ssid == net_prio_cgrp_id ||
+#endif
+#ifdef CONFIG_CGROUP_DEBUG
+ ssid == debug_cgrp_id ||
+#endif
+ false;
+}
+
/* Display information about each subsystem and each hierarchy */
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
@@ -675,11 +699,19 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
* cgroup_mutex contention.
*/

- for_each_subsys(ss, i)
+ for_each_subsys(ss, i) {
+ int count;
+
+ if (!cgroup_on_dfl(&ss->root->cgrp) || is_v2_utility_controller(i))
+ count = atomic_read(&ss->root->nr_cgrps);
+ else
+ count = atomic_read(&ss->root->nr_css[i]);
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps),
+ count,
cgroup_ssid_enabled(i));
+ }

return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a66c088c851c..f25d0e77ae8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2047,6 +2047,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)

INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
+ for (int i = 0; i < CGROUP_SUBSYS_COUNT; ++i)
+ atomic_set(&root->nr_css[i], 0);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);

@@ -5362,6 +5364,7 @@ static void css_free_rwork_fn(struct work_struct *work)
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+ atomic_dec(&ss->root->nr_css[ss->id]);

if (parent)
css_put(parent);
@@ -5503,6 +5506,7 @@ static int online_css(struct cgroup_subsys_state *css)
atomic_inc(&css->online_cnt);
if (css->parent)
atomic_inc(&css->parent->online_cnt);
+ atomic_inc(&ss->root->nr_css[ss->id]);
}
return ret;
}
--
2.45.0.rc1.225.g2a3ae87e7f-goog