More than 5 years have passed since last update.

stop_machine()の実装を読む

Posted at 2019-05-08

お久しぶりです。
@satoru_takeuchi さんのシステムを一時的に停止させるカーネルモジュールという良い記事を見つけました。
仕事でソースあさっていたとき、stop_machine()を見つけてちょっと厨二病くすぐられたけど、進捗に追われて「後で読もう」のままになってたのを思い出しました。
興味がわいたので、読んでみることにしました。なお、Linuxカーネルのバージョン5.0.13のソースを読み、引用しています。

stop_machine()の実装概要とデータ構造

stop_machine()に関連するデータ構造はおおよそ以下の通りです。以下worksはキューで、stop_machine()に実行させたい処理および関連する各種パラメータをworkに詰めてキューイングします。
そして、CPUごとに存在するスレッドがworksキューからworkを取り出し、stop_machine()で要求した処理を実行します。

stop_machine()が面白いのは、スレッドがバラバラのタイミングで要求された処理を実行するのでなく、「足並みをそろえて」stop_machine()で要求された処理を実行する点にあります。これによって、「それぞれのCPUの割り込みは禁止され、プリエンプトできない状態になっている」状態を保ちつつ、指定されたCPU群上で要求された処理を実行する流れになっているのです。

この説明でわかる人はきっといないと思いますので、ソースを読みながら明らかにしましょう。

実装を読む

初期化

初期化時、先の概念図に現れたスレッドを生成します。

kernel/stop_machine.c

/*
 * Early-boot setup for stop_machine: initializes the lock and the work
 * queue of every possible CPU's stopper, registers the per-CPU threads,
 * and finally marks stop_machine as initialized.  Returns 0.
 */
static int __init cpu_stop_init(void)
{
    unsigned int cpu;

    for_each_possible_cpu(cpu) {
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

        raw_spin_lock_init(&stopper->lock);
        // Initialize the queue that accepts requests issued via stop_machine
        INIT_LIST_HEAD(&stopper->works);
    }

    // The per-CPU threads are created by the call inside the BUG_ON()
    // argument; a non-zero return value from it triggers BUG_ON().
    BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
    stop_machine_unpark(raw_smp_processor_id());
    stop_machine_initialized = true;
    return 0;
}
early_initcall(cpu_stop_init);

スレッドを作る処理はsmpboot_register_percpu_thread()です。BUG_ONマクロの引数内にあるのでわかりにくいかもしれません。
smpboot_register_percpu_thread()は最終的に以下のコードをCPU数分だけ呼び出します。このコードによって、スレッドの本体は、smpboot_thread_fn()だとわかります。

kernel/smpboot.c

/* Excerpt: creates the thread described by @ht for CPU @cpu. */
static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
// (omitted)
    // smpboot_thread_fn() is passed as the function the new thread runs.
    tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                    ht->thread_comm);

stop_machine()による処理依頼

次に、stop_machine()がworksキューに依頼をキューイングするまでの流れを追います。
stop_machine()の処理のほとんどを行っている関数が、stop_machine_cpuslocked()です。以下set_state()およびmsdataはとても重要ですが、後ほど説明します。

kernel/stop_machine.c

/*
 * Does most of the work of stop_machine(): packs the caller's request into
 * a multi_stop_data, puts it into the initial state, and hands it to
 * stop_cpus() for all online CPUs.  Returns whatever stop_cpus() returns.
 *
 * @fn:   function the caller wants executed
 * @data: argument passed to @fn
 * @cpus: stored as msdata.active_cpus
 */
int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
                const struct cpumask *cpus)
{
    // This is the msdata shown in the earlier diagram; it is explained in detail later.
    // Note that the function pointer given to stop_machine() (the work to be executed)
    // is stored in the fn member of msdata.  Keep this in mind.
    struct multi_stop_data msdata = {
        .fn = fn,
        .data = data,
        .num_threads = num_online_cpus(),
        .active_cpus = cpus,
    };

    // (omitted) set_state() plays an important role, but for now just note that,
    // as the original kernel comment below says, it "sets the initial state".  Details follow later.
    /* Set the initial state and stop all online cpus. */
    set_state(&msdata, MULTI_STOP_PREPARE);
    // The function handed to stop_cpus() is multi_cpu_stop(), not @fn itself.
    return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}

その後、__stop_cpus()を呼び、CPUごとのスレッドに依頼を行い、依頼の完了を待ちます。なお、上コードのstop_cpus()呼び出しをよく見るとわかるのですが、stop_cpus()に渡している関数ポインタは、stop_machine()に渡した関数ポインタでなく、multi_cpu_stop()という関数を指しています。multi_cpu_stop()については後ほど説明します。

kernel/stop_machine.c

/*
 * Queues @fn(@arg) on the CPUs in @cpumask and blocks until the queued
 * work has completed.  Returns -ENOENT when queue_stop_cpus_work()
 * reports that nothing was queued, otherwise done.ret.
 */
static int __stop_cpus(const struct cpumask *cpumask,
               cpu_stop_fn_t fn, void *arg)
{
    struct cpu_stop_done done;

    cpu_stop_init_done(&done, cpumask_weight(cpumask));
    // Ask the per-CPU threads to perform the processing requested via stop_machine
    if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
        return -ENOENT;
    // Wait for the requested processing to complete
    wait_for_completion(&done.completion);
    return done.ret;
}

依頼を出す処理をもう少し詳しく

依頼を出す処理queue_stop_cpus_work()は以下の実装です。以下のように、CPUごとにcpu_stop_queue_work()を呼びます。cpu_stop_queue_work()は__cpu_stop_queue_work()を呼んで「依頼」を表現したworkをCPUごとに存在するstopper->worksキューにつなぎ、その上でwake_up_q()を呼び出してCPUごとのスレッドを起こします。
なお、workのfnに設定する関数ポインタはmulti_cpu_stop()を指しています。ここを注意してください。

kernel/stop_machine.c

/*
 * Excerpt: fills in each CPU's per-CPU stop_work with @fn, @arg and @done
 * and queues it for every CPU in @cpumask.
 */
static bool queue_stop_cpus_work(const struct cpumask *cpumask,
                 cpu_stop_fn_t fn, void *arg,
                 struct cpu_stop_done *done)
{
    // (omitted)
    // Issue the request to every per-CPU thread.
    for_each_cpu(cpu, cpumask) {
        work = &per_cpu(cpu_stopper.stop_work, cpu);
        // This fn is multi_cpu_stop(), not the function given to stop_machine().  Do not confuse the two.
        work->fn = fn;
        work->arg = arg;
        work->done = done;
        // Remember whether at least one work item was actually queued.
        if (cpu_stop_queue_work(cpu, work))
            queued = true;
    }
    // (omitted)

cpu_stop_queue_work()および__cpu_stop_queue_work()は以下の通りです。そんなに難しい処理ではないので、興味のある方は読んでください。

kernel/stop_machine.c

/*
 * Appends @work to the tail of @stopper's works list and adds the
 * stopper's thread to @wakeq; the thread is woken later, when the caller
 * runs wake_up_q() on @wakeq.
 */
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
                    struct cpu_stop_work *work,
                    struct wake_q_head *wakeq)
{
    list_add_tail(&work->list, &stopper->works);
    wake_q_add(wakeq, stopper->thread);
}

/* queue @work to @stopper.  if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
// (omitted)
    // The work is queued only while the stopper is enabled.
    enabled = stopper->enabled;
    if (enabled)
        __cpu_stop_queue_work(stopper, work, &wakeq);
// (omitted)
    // Wake the threads collected in wakeq.
    wake_up_q(&wakeq);
// (omitted)

スレッドによる処理実行

先に書いたとおり、初期化時にCPUごとにスレッドを生成します。そのスレッドは以下のとおりです。stop_machine()の実装を理解するためには、以下のとおり関数ポインタを経由して処理呼び出しをすることだけわかれば十分かと思います。

kernel/smpboot.c

/*
 * Excerpt: main loop of a per-CPU thread.  @data is the thread's
 * smpboot_thread_data, which carries the smp_hotplug_thread descriptor
 * and the CPU number.
 */
static int smpboot_thread_fn(void *data)
{
    struct smpboot_thread_data *td = data;
    struct smp_hotplug_thread *ht = td->ht;

    while (1) {
// (omitted)
        } else {
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            // The actual work is dispatched through the thread_fn
            // function pointer of the descriptor, with this thread's
            // CPU number as the argument.
            ht->thread_fn(td->cpu);
        }
// (omitted)

なお、ht->thread_fnが指す関数はcpu_stopper_thread()で、この関数がworksキューからworkを取り出し、workのfnに設定されたmulti_cpu_stop()を呼び出します。この時点では、stop_machine()に渡された関数は呼ばれません。
次にmulti_cpu_stop()を見ましょう。ここがstop_machine()一番の山場だと思います。

kernel/stop_machine.c

/*
 * Function executed by each per-CPU thread for a stop_machine request.
 * @data is the shared multi_stop_data.  The thread spins in the state
 * machine below, reacting once to every change of msdata->state and
 * acknowledging it, until the state reaches MULTI_STOP_EXIT.
 *
 * Returns the result of msdata->fn when this CPU is "active" and ran it;
 * otherwise 0.
 */
static int multi_cpu_stop(void *data)
{
    struct multi_stop_data *msdata = data;
    enum multi_stop_state curstate = MULTI_STOP_NONE;
    int cpu = smp_processor_id(), err = 0;
    unsigned long flags;
    bool is_active;

    /*
     * When called from stop_machine_from_inactive_cpu(), irq might
     * already be disabled.  Save the state and restore it on exit.
     */
    local_save_flags(flags);

    /*
     * Decide whether this CPU runs msdata->fn: with no explicit mask only
     * the first online CPU does, otherwise every CPU in active_cpus does.
     */
    if (!msdata->active_cpus)
        is_active = cpu == cpumask_first(cpu_online_mask);
    else
        is_active = cpumask_test_cpu(cpu, msdata->active_cpus);

    /* Simple state machine */
    do {
        /* Chill out and ensure we re-read multi_stop_state. */
        cpu_relax_yield();
        if (msdata->state != curstate) {
            /*
             * Record the new state first, so each state is handled and
             * acknowledged exactly once by this thread; ack_state() is
             * not called again until msdata->state changes.
             */
            curstate = msdata->state;
            switch (curstate) {
            case MULTI_STOP_DISABLE_IRQ:
                local_irq_disable();
                hard_irq_disable();
                break;
            case MULTI_STOP_RUN:
                /* The function given to stop_machine() runs here. */
                if (is_active)
                    err = msdata->fn(msdata->data);
                break;
            default:
                break;
            }
            ack_state(msdata);
        } else if (curstate > MULTI_STOP_PREPARE) {
            /*
             * At this stage all other CPUs we depend on must spin
             * in the same loop. Any reason for hard-lockup should
             * be detected and reported on their side.
             */
            touch_nmi_watchdog();
        }
    } while (curstate != MULTI_STOP_EXIT);

    local_irq_restore(flags);
    return err;
}

ここで疑問が一つ現れると思います。「各スレッドがバラバラのタイミングで実行された場合、あるCPUに対応したスレッドが依頼した処理をしている一方、別のCPUに対応したスレッドが割り込み禁止処理を完了していないこともありえる。これにより、別CPUの割り込み起因の処理が、依頼した処理に影響する可能性もある。それはどう防ぐのか。」という点です。

それを以降で説明します。

「足並みをそろえる」

足並みをそろえるための処理がack_state()です。ここを理解するには、set_state()とmsdataを見る必要があります。

kernel/stop_machine.c

/*
 * Acknowledges the current state on behalf of one thread.  The thread
 * whose decrement brings the counter to zero advances the shared state.
 */
static void ack_state(struct multi_stop_data *msdata)
{
    // Each call decrements thread_ack; when it reaches 0, set_state() is called
    if (atomic_dec_and_test(&msdata->thread_ack))
        // set_state() is given state + 1, i.e. it moves on to the next state.
        set_state(msdata, msdata->state + 1);
}

set_state()は以下の実装になっています。状態の遷移とともに、msdata->thread_ackにmsdata->num_threadsの値をセットしています。

kernel/stop_machine.c

/*
 * Moves @msdata to @newstate and re-arms the acknowledgement counter with
 * num_threads, so that every thread has to acknowledge the new state
 * before the next transition can happen.
 */
static void set_state(struct multi_stop_data *msdata,
              enum multi_stop_state newstate)
{
    /* Reset ack counter. */
    atomic_set(&msdata->thread_ack, msdata->num_threads);
    /*
     * Write barrier between the counter reset and the state store;
     * presumably so a thread that observes the new state also observes
     * the reset counter -- NOTE(review): confirm against the kernel
     * memory-barrier documentation.
     */
    smp_wmb();
    msdata->state = newstate;
}

念のため、msdataを再度引用します。num_threadsには、オンラインCPU数(num_online_cpus()の値)をセットしています。よって、上に引用したset_state()では、状態の遷移とともに、実はカウンタとして扱われるthread_ackをオンラインCPU数でリセットしていると言えます。

kernel/stop_machine.c

    // Excerpt from stop_machine_cpuslocked(), repeated for reference.
    struct multi_stop_data msdata = {
        .fn = fn,
        .data = data,
        // Number of online CPUs; set_state() resets thread_ack to this value.
        .num_threads = num_online_cpus(),
        .active_cpus = cpus,
    };

ここでもう一度、multi_cpu_stop()の以下実装を見ましょう。curstateとmsdataに記録されている状態が異なる場合にだけifステートメントの処理が実行され、curstateがmsdata->stateの値で上書きされ、ack_state()が呼ばれます。
ここで、curstateがmsdata->stateで上書きされるため、msdata->stateの値が変わる、つまり次の状態遷移が起こるまで該当スレッドではack_state()が呼ばれません。
そして、全スレッドがack_state()を呼び出したとき、thread_ackの値が0となり、ここではじめてset_state()を経由して状態が遷移し、thread_ackの値がリセットされます。

kernel/stop_machine.c

    // Excerpt from multi_cpu_stop(), repeated for reference.
    enum multi_stop_state curstate = MULTI_STOP_NONE;
// (omitted)
    /* Simple state machine */
    do {
        /* Chill out and ensure we re-read multi_stop_state. */
        cpu_relax_yield();
        // Entered only when the shared state differs from the state this
        // thread handled last.
        if (msdata->state != curstate) {
            // Overwriting curstate here keeps this thread from calling
            // ack_state() again until msdata->state changes.
            curstate = msdata->state;
            switch (curstate) {
            case MULTI_STOP_DISABLE_IRQ:
                local_irq_disable();
                hard_irq_disable();
                break;
            case MULTI_STOP_RUN:
                if (is_active)
                    err = msdata->fn(msdata->data);
                break;
            default:
                break;
            }
            ack_state(msdata);
        } else if (curstate > MULTI_STOP_PREPARE) {

言葉だとわかりにくいので、以下図を示します。なお、以下の図ではCPU数は4とし、横棒はスレッド、縦破線矢印はack_state()の呼び出しを示します。
また、四角内数値はthread_ackで、ack_state()呼び出しによりデクリメントされ、thread_ackが0になったとき、状態遷移(msdata->stateの変更)とthread_ackのリセットがなされることも表現しています。
これによって、各スレッドが足並みをそろえながら同じ状態に遷移します。

なお、msdata->stateのとる値は以下の通りで、状態は以下に示した順序で遷移します。

順序	状態名	概要
1	MULTI_STOP_PREPARE	初期状態
2	MULTI_STOP_DISABLE_IRQ	初期状態の次で、該当CPU上での割り込みを禁止し、stop_machine()で要求された処理実行の準備を行う
3	MULTI_STOP_RUN	stop_machine()で要求された処理を実行する。multi_cpu_stop()の実装を見るとわかりますが、この状態のとき、stop_machine()で指定された処理が実行されます。
4	MULTI_STOP_EXIT	stop_machine()で要求された処理実行を完了した

これらの情報を参考にmulti_cpu_stop()を各自で読んでみましょう。きっとstop_machine()の仕組みがわかるかなと思います。

おわりに

やはり、カーネルのコードを読むのは楽しいですね。stop_machine()、ちょっと厨二病っぽい名前でしたが、足並みをそろえるための工夫は面白いなと感じました。
カーネルに限った話ではないですが、ちょっと疑問に思ったらコードを読んでみて、わかるところから読み、理解することを繰り返すといろいろと勉強になります。というより、そんな教訓めいた話をしなくても単純に面白いですね。
最後に、良い記事を書いていただき、今回のコード読みをするきっかけを作って下さった @satoru_takeuchi さん、どうもありがとうございました。
それでは、また。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up