taskset と cgroup は何が違うのか

taskset と cgroup は、どちらもプロセスが利用可能な CPU コアを制限できる技術。どんな違いがあるのかを調べた。なお、本記事の cgroup は cgroup v1 (cpuset サブシステム) のこと。

taskset は sched_setaffinity システムコールを利用したコマンドラインツール。sched_setaffinity はユーザが実行したプロセスの CPU Affinity をユーザの権限で変えられる。プロセスからは CPU が存在するけど自発的に使わない状態。(≒このコアじゃないとヤダ！)
cgroup は CPU/メモリなどのリソースを隔離するための仕組み。cgroup は設定権限をファイルのパーミッションで管理し、ユーザからの操作を禁止できる。プロセスからは CPU が存在しない状態。(≒このコアだけ使ってね)
sched_setaffinity と cgroup はユーザへのインタフェースが異なるが、カーネル内部の実装はだいたい同じ(プロセスを管理する構造体(task_struct) の cpus_allowed に利用可能な CPU の情報を設定するだけ)。

※1 CPU Affinity はコアを固定する。過去には「コアを固定するけど他のコアがガラガラだったら使う」というソフトリミットが提案されたようだが(CPU Soft Affinity)、“Sounds like a hard sell.”(受け入れてもらうのは難しそう、くらいの意味)という無慈悲なコメントがついていた。最新のカーネルでも採用されてなかったので、ハードリミットの方針はとうぶん変わらない気がする。

以下、調べたことのメモ。

taskset 使ってみる

設定する

]# taskset -p -c 0 $$
プロセス ID 7862 の現在の親和性リスト: 0,1
プロセス ID 7862 の新しい親和性リスト: 0

]# cat /proc/$$/status | grep Cpus
Cpus_allowed:   1
Cpus_allowed_list:      0

実験する

1 コアにしか負荷がかかっていないのがわかる。

]# stress --cpu 4
stress: info: [8020] dispatching hogs: 4 cpu, 0 io, 0 vm, 0 hdd

]# top
top - 04:19:07 up 2 days, 20:32,  5 users,  load average: 0.24, 0.22, 0.09
Tasks: 128 total,   6 running, 122 sleeping,   0 stopped,   0 zombie
%Cpu0  : 98.3 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  1.7 hi,  0.0 si,  0.0 st
%Cpu1  :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7717.3 total,    132.6 free,   6615.4 used,    969.3 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.    605.8 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 8032 root      20   0    7948     96      0 R  25.9   0.0   0:00.61 stress
 8030 root      20   0    7948     96      0 R  24.1   0.0   0:00.61 stress
 8031 root      20   0    7948     96      0 R  24.1   0.0   0:00.60 stress
 8033 root      20   0    7948     96      0 R  24.1   0.0   0:00.60 stress

cgroup 使ってみる

設定する

]# mkdir /sys/fs/cgroup/cpuset/sandbox
]# echo 0 >  /sys/fs/cgroup/cpuset/sandbox/cpuset.cpus
]# echo 0 >  /sys/fs/cgroup/cpuset/sandbox/cpuset.mems
]# echo $$ >> /sys/fs/cgroup/cpuset/sandbox/tasks

]# cat /proc/$$/cgroup
12:pids:/user.slice/user-0.slice/session-9655.scope
11:cpuset:/sandbox
10:perf_event:/
9:devices:/user.slice
8:blkio:/system.slice/sshd.service
7:memory:/user.slice/user-0.slice/session-9655.scope
6:freezer:/
5:cpu,cpuacct:/
4:rdma:/
3:hugetlb:/
2:net_cls,net_prio:/
1:name=systemd:/user.slice/user-0.slice/session-9655.scope

]# cat /proc/$$/status | grep Cpus
Cpus_allowed:   1
Cpus_allowed_list:      0

なお、cgroup が設定されていたとしても /proc/cpuinfo などの情報は変化しない(システム全体の CPU が見える)。プロセスが実際に利用できる CPU 数を取りたい場合は nproc を使う。

]# cat /proc/cpuinfo | grep processor | wc -l
2
]# nproc
1

実験する

1 コアにしか負荷がかかっていないのがわかる。

]# stress --cpu 4
stress: info: [8250] dispatching hogs: 4 cpu, 0 io, 0 vm, 0 hdd

]# top
top - 04:21:34 up 2 days, 20:35,  4 users,  load average: 0.51, 0.34, 0.16
Tasks: 126 total,   5 running, 121 sleeping,   0 stopped,   0 zombie
%Cpu0  : 98.2 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  1.8 hi,  0.0 si,  0.0 st
%Cpu1  :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7717.3 total,    136.9 free,   6604.4 used,    976.1 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.    613.5 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 8254 root      20   0    7948     92      0 R  24.8   0.0   0:01.15 stress
 8251 root      20   0    7948     92      0 R  24.3   0.0   0:01.15 stress
 8252 root      20   0    7948     92      0 R  24.3   0.0   0:01.15 stress
 8253 root      20   0    7948     92      0 R  24.3   0.0   0:01.15 stress

実装をながめる

sched_setaffinity

sched_setaffinity は kernel/sched/core.c で実装されている。ユーザが指定した cpumask (in_mask) と、cgroup で許可されている CPU (cpus_allowed) の AND を取って、最終的に許可する CPU のリスト (new_mask) を作成し、task_struct の cpus_allowed に設定する。

long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
        ...
    // cgroup で許可されている cpu を cpus_allowed に設定する
        cpuset_cpus_allowed(p, cpus_allowed);
        cpumask_and(new_mask, in_mask, cpus_allowed);
    ...
again:
        retval = __set_cpus_allowed_ptr(p, new_mask, true);
    ...
}
...
static int __set_cpus_allowed_ptr(struct task_struct *p,
                                  const struct cpumask *new_mask, bool check)
{
        ...
        do_set_cpus_allowed(p, new_mask);
        ...
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
        struct rq *rq = task_rq(p);
    ...
        p->sched_class->set_cpus_allowed(p, new_mask);
    ...
}
...
// スケジューラごとに異なるが、大体これが呼ばれる
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
        cpumask_copy(&p->cpus_allowed, new_mask);
        p->nr_cpus_allowed = cpumask_weight(new_mask);
}

cgroup

いろいろすっ飛ばすと、ユーザが cpuset.cpus に書き込んだ値をもとに cgroup の制約(例えばグループは親子階層があって親の制限を超えて子にコアを割り当てられないなど。)に基づくチェックをしつつ、 task_struct の cpus_allowed に、最終的に許可する cpu のリストを設定する。

static struct cftype legacy_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },
...

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        struct cpuset *trialcs;
    ...
        case FILE_CPULIST:
                retval = update_cpumask(cs, trialcs, buf);
                break;
        ...
}
...
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
{
        ...

        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
        } else {
                retval = cpulist_parse(buf, trialcs->cpus_allowed);
    ...
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
...
/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When congifured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
            // 階層構造に関する処理もろもろ
            ...

                update_tasks_cpumask(cp);
            ...
    }
    ...
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
            // この関数以降は sched_setaffinity と同じ
                set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
}

権限について

sched_setaffinity

sched_setaffinity は、呼び出し元の実効ユーザ ID が、pid で指定した thread の実ユーザ ID または実効ユーザ ID と同じであれば実行できる(そうでない場合は CAP_SYS_NICE ケーパビリティが必要)。つまり、自分が実行したプロセスの設定はユーザが自由に変えられる。

EPERM (sched_setaffinity()) The calling thread does not have appropriate privileges. The caller needs an effective user ID equal to the real user ID or effective user ID of the thread identified by pid, or it must possess the CAP_SYS_NICE capability in the user namespace of the thread pid. (引用元: man sched_setaffinity)

一方 cgroup は、デフォルトだと cgroupfs 配下のファイルの所有者が root で、書き込み権限も root にしかない。そのため、一般ユーザは cgroup の操作ができない。権限を設定すればユーザが自由に変えられるが、一般的には、制限されたプロセスからは触らせないようにするはず。Docker も cgroup を使っているらしい( Docker-docs-ja コントロール・グループ)。ただコンテナ系は namespace が分離されててファイルシステムが別なので、そもそもコンテナからホストの cgroupfs はさわれないよねとかあると思う。そのうち調べたい。

]$ ls -al /sys/fs/cgroup/cpuset/tasks
-rw-r--r--. 1 root root 0  6月 26 03:09 /sys/fs/cgroup/cpuset/tasks

]$ id
uid=1002(kimullaa) gid=1002(kimullaa) groups=1002(kimullaa) context=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
]$ echo $$ > /sys/fs/cgroup/cpuset/tasks
-bash: /sys/fs/cgroup/cpuset/tasks: 許可がありません