More than 5 years have passed since last update.

cpu.sharesの内部動作

Posted at 2018-07-30

これは何

linuxのリソース制御機能であるcgroupのcpuサブシステムのパラメータの一つである、cpu.sharesへの設定変更がカーネル内でどのように参照されていくかを調べたメモ.
間違ってるところもあると思うのでツッコミ大歓迎ですm(_ _)m

結論

cpu.sharesに値をセットすると，当該プロセスグループに対するCFSの赤黒木アルゴリズムの優先度(load_weight)が変更される．
- cpu.shares->cpu_shares_write_u64->sched_group_set_shares->update_cfs_group->reweight_entity->update_load_setの順に値が渡されてセットされる．
load_weightは従来のlinuxにおけるプロセスの優先度制御パラメータであるNice値を置き換えるもので，load_weight設定後の動作はスケジューラ(CFS)の動作に準じる（cpu.sharesによって挙動は変更されていない・・と思われる）

ソースコード解析

前提

CONFIG_CGROUPS=true
CONFIG_CGROUP_SCHEDはCPUサブシステムのon/offなのでtrue
- linux/include/linux/cgroup_subsys.h L16
```
#if IS_ENABLED(CONFIG_CGROUP_SCHED)
SUBSYS(cpu)
#endif
```
CONFIG_RT_GROUP_SCHEDはrealtime scheduler関連なので読み捨て
CONFIG_FAIR_GROUP_SCHEDはtrue[4]

When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create task groups and modify their CPU share using the "cgroups" pseudo filesystem.
CONFIG_FAIR_GROUP_SCHEDが定義されている時は、擬似ファイルシステムを使って作られたグループごとに"cpu.shares"というファイルが作られています。それでは、"cgroups"擬似ファイルシステムを使ってタスクグループを作ったりそれらのCPU時間の分け前を変えたりするステップの例を見てみましょう
CONFIG_CFS_BANDWIDTHはcpuの消費量を制御するものなので読み捨て

(1) `cpu.shares`の呼び出し

special file sharesの動作規定．書き込みの際に，cpu_shares_write_u64を実行している．
kernel/sched/core.c L6819

static struct cftype cpu_legacy_files[] = {
# ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
# endif

(2) `cpu_shares_write_u64`

sched_group_set_sharesを呼び出し．設定対象のcgroupのtask group構造体にsharesの新たな値をセット．
- css_tg(css)は，設定対象のcgroup(cgroup_subsys_state構造体css)のtask group構造体を参照するメソッド．
kernel/sched/core.c L6527

# ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Write handler for the "shares" control file (installed as .write_u64 in
 * cpu_legacy_files).
 *
 * Looks up the task group behind @css with css_tg(), passes
 * scale_load(@shareval) to sched_group_set_shares() and returns that
 * function's result unchanged. @cftype is not used here.
 */
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}

(2-1) struct `task_group`

task_groupは2.6.24で追加された，タスクをグループ化するための構造体．[3]
sched_entity構造体と,cfs_rq構造体,sharesが内部で保持される．
kernel/sched/sched.h L352

/*
 * Per-group scheduler state (excerpt; part of the definition is elided
 * below). With CONFIG_FAIR_GROUP_SCHED it carries the per-CPU entity and
 * runqueue pointer arrays plus the group's shares value.
 */
struct task_group {
	struct cgroup_subsys_state css;

# ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each CPU */
	struct sched_entity	**se;
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq		**cfs_rq;
	/* value stored by sched_group_set_shares() */
	unsigned long		shares;

# ifdef	CONFIG_SMP
	/*
	 * load_avg can be heavily contended at clock tick time, so put
	 * it in its own cacheline separated from the fields above which
	 * will also be accessed at each tick.
	 */
	atomic_long_t		load_avg ____cacheline_aligned;
# endif
# endif

/* ... remaining members omitted in this excerpt ... */

};

(2-2) struct `sched_entity`

load_weight構造体の変数loadを持つ．
- load_weightは優先度に基づいた重みが格納される構造体[8]．
linux/include/linux/sched.h L446

/*
 * Scheduling entity (excerpt; only the weight-related members are shown,
 * the rest of the definition is elided below).
 */
struct sched_entity {
	/* For load-balancing: */
	/* weight is written through update_load_set() in reweight_entity() */
	struct load_weight		load;
	/* assigned from the 'runnable' argument in reweight_entity() */
	unsigned long			runnable_weight;

/* ... remaining members omitted in this excerpt ... */

};

(2-3) struct `cfs_rq`

load_weight構造体の変数loadを持つ．（sched_entityと同様）
kernel/sched/sched.h L477

/*
 * CFS-related fields in a runqueue (excerpt; parts of the definition are
 * elided below).
 */
struct cfs_rq {
	/* same struct load_weight type as sched_entity::load */
	struct load_weight	load;
	unsigned long		runnable_weight;
    
/* ... members omitted in this excerpt ... */
    
	/*
	 * NOTE(review): presumably accumulates contributions of removed
	 * entities under its own lock; not shown in this excerpt, confirm
	 * against the kernel source.
	 */
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

/* ... remaining members omitted in this excerpt ... */

};

(2-4) struct `load_weight`

load_weightはweightとその逆数の2値を持つ構造体．
include/linux/sched.h L313

/*
 * A weight together with a second value derived from it; embedded as the
 * 'load' member of both struct sched_entity and struct cfs_rq.
 */
struct load_weight {
	/* the weight itself; assigned by update_load_set() */
	unsigned long			weight;
	/*
	 * Reset to 0 by update_load_set() whenever weight changes.
	 * Presumably a cached inverse of weight that is recomputed on demand;
	 * TODO confirm against the kernel source.
	 */
	u32				inv_weight;
};

[8]より抜粋(文中のload.weightは誤記ではなく，sched_entityのメンバload(load_weight構造体)のweightフィールド，すなわちse->load.weightを指す)

処理
ここからはプロセスの一生を通じてどのようにスケジューリングされるかおおまかに追っていきます。

まずプロセスが生成されると、スケジューリング関連の値はsched_fork()の中で初期化されます。そのあとtask_fork_fair()の中でsched_entityのvruntimeの値は親のvruntimeの値になります。しかしその後place_entity() が呼ばれ、vruntimeの値はそれまでのvruntimeの値に１回実行権が与えられたときの実行時間が足されます。これにより、次々とforkが起こる事で新たに作られたプロセスばかりが実行されないようになります。 (task_fork_fair()の中でvruntimeからcfs_rq.min_vruntimeの値が引かれていますが、これはまたenqueue_entity()のなかで足されて戻ります。おそらく眠りから覚めたプロセスと扱いを揃えるため？)

また、sched_entity中のload.weightがプロセスのstatic_prioに基づいてset_load_weight()で初期化されます。実際の重みは優先度ごとにあらかじめ値が決められています。(prio_to_weight[]) 重みは優先度が高いほど大きく、優先度が低いほど小さく定められています。式で書くと weight = 1.25^(-nice) * 1024ぐらいみたいです。最後にdo_fork()からwake_up_new_task()が呼ばれ、プロセスが赤黒木に入ります。

(3) `sched_group_set_shares`

第１引数で指定されたtask_group構造体に対し，以下を実行
- tg->sharesにsharesの新しい値をセット．
- 第１引数で指定されたtask_groupが管理する全てのsched_entity構造体（sched_entityはCPU毎のため複数存在）に対してupdate_cfs_groupを実行．
kernel/sched/fair.c L10328

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares; # 第１引数`tg->shares`に第２引数を代入
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se = tg->se[i];
		struct rq_flags rf;

		/* Propagate contribution to hierarchy */
		rq_lock_irqsave(rq, &rf);
		update_rq_clock(rq);
		for_each_sched_entity(se) {
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
			update_cfs_group(se);
		}
		rq_unlock_irqrestore(rq, &rf);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}

(4)`update_cfs_group`

reweight_entityを呼び出し．　＃ CONFIG_SMPが有効な場合は，calc_group_shares / calc_group_runnableで値を算出しているようだが，割愛．（CONFIG_SMPが無効な場合はtg->sharesがそのまま使われる）
kernel/sched/fair.c L3007

/*
 * Recomputes the group entity based on the current state of its group
 * runqueue.
 *
 * Does nothing when @se has no group runqueue (group_cfs_rq() returns NULL)
 * or when throttled_hierarchy() reports true for that runqueue. Otherwise
 * it determines a shares value and a runnable value and hands both to
 * reweight_entity().
 */
static void update_cfs_group(struct sched_entity *se)
{
	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
	long shares, runnable;

	if (!gcfs_rq)
		return;

	if (throttled_hierarchy(gcfs_rq))
		return;

# ifndef CONFIG_SMP
	/* Without CONFIG_SMP: use the group's configured shares for both values. */
	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);

	/* The entity already carries this weight, so there is nothing to do. */
	if (likely(se->load.weight == shares))
		return;
# else
	/* With CONFIG_SMP: both values come from the calc_group_*() helpers. */
	shares   = calc_group_shares(gcfs_rq);
	runnable = calc_group_runnable(gcfs_rq, shares);
# endif

	reweight_entity(cfs_rq_of(se), se, shares, runnable);
}

(5)`reweight_entity`

update_load_setにて，se->loadにsharesをセットしている．
[TODO1] cfs_rq->loadには値を設定しているのかよくわからず．
[TODO2] runnable_weightの意味はよくわかっていない．
kernel/sched/fair.c L2808

/*
 * Give @se a new weight (@weight) and runnable weight (@runnable).
 *
 * The dequeue_*() / account_entity_dequeue() helpers are called before the
 * values are changed and the matching enqueue helpers afterwards; the
 * account_entity_*() and *_runnable_load_avg() calls are made only when
 * se->on_rq is set.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight, unsigned long runnable)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
		dequeue_runnable_load_avg(cfs_rq, se);
	}
	dequeue_load_avg(cfs_rq, se);

	/* The actual update: new runnable weight and new load weight. */
	se->runnable_weight = runnable;
	update_load_set(&se->load, weight);

# ifdef CONFIG_SMP
	/* Recompute both averages from their sums using the new weights. */
	do {
		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;

		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
		se->avg.runnable_load_avg =
			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
	} while (0);
# endif

	enqueue_load_avg(cfs_rq, se);
	if (se->on_rq) {
		account_entity_enqueue(cfs_rq, se);
		enqueue_runnable_load_avg(cfs_rq, se);
	}
}

(6)`update_load_set`

kernel/sched/fair.c L133

/*
 * Store @w as the weight of @lw and reset lw->inv_weight to 0.
 */
static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	/* presumably marks the derived value as stale so it is recomputed later; TODO confirm */
	lw->inv_weight = 0;
}

参考文献

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

cpu.sharesの内部動作

これは何

結論

ソースコード解析

前提

(1) cpu.sharesの呼び出し

(2) cpu_shares_write_u64

(2-1) struct task_group

(2-2) struct sched_entity

(2-3) struct cfs_rq

(2-4) struct load_weight

(3) sched_group_set_shares

(4)update_cfs_group

(5)reweight_entity

(6)update_load_set

参考文献

(1) `cpu.shares`の呼び出し

(2) `cpu_shares_write_u64`

(2-1) struct `task_group`

(2-2) struct `sched_entity`

(2-3) struct `cfs_rq`

(2-4) struct `load_weight`

(3) `sched_group_set_shares`

(4)`update_cfs_group`

(5)`reweight_entity`

(6)`update_load_set`