记录学习ARM Linux的多核启动过程

1.概述
本文主要是记录学习Linux的多核启动的过程，对学习过程进行总结，以便进行后续回顾。
平台：ARM Vexpress
内核版本：linux-4.9
2.smp_operations初始化
系统启动过程中，Linux kernel提供了smp boot实现的框架，要实现smp boot，先要填充好smp_operations这个结构体，smp_operations结构体定义如下所示：
/*
 * Platform hooks used by the kernel's SMP boot framework. A platform fills
 * in this structure to implement bring-up of its CPUs. All members exist
 * only when CONFIG_SMP is enabled.
 */
struct smp_operations {
#ifdef CONFIG_SMP
/*
* Setup the set of possible CPUs (via set_cpu_possible)
*/
void (*smp_init_cpus)(void);
/*
* Initialize cpu_possible map, and enable coherency
*/
void (*smp_prepare_cpus)(unsigned int max_cpus);
/*
* Perform platform specific initialisation of the specified CPU.
*/
void (*smp_secondary_init)(unsigned int cpu);
/*
* Boot a secondary CPU, and assign it the specified idle task.
* This also gives us the initial stack to use for this CPU.
*/
int  (*smp_boot_secondary)(unsigned int cpu, struct task_struct *idle);
#ifdef CONFIG_HOTPLUG_CPU
/*
* CPU hotplug hooks; compiled in only with CONFIG_HOTPLUG_CPU.
* NOTE(review): their exact contracts are not shown in this excerpt.
*/
int  (*cpu_kill)(unsigned int cpu);
void (*cpu_die)(unsigned int cpu);
bool  (*cpu_can_disable)(unsigned int cpu);
int  (*cpu_disable)(unsigned int cpu);
#endif
#endif
};
2.1.smp_operations初始化流程
start_kernel()->setup_arch()
在该函数中，通过以下代码初始化smp_operations结构：
/* Excerpt from setup_arch(): choose the smp_operations for this machine. */
#ifdef CONFIG_SMP
if (is_smp()) {
/*
 * The fallback below runs only when the machine has no smp_init hook
 * or the hook returns false. A hook that returns true suppresses it.
 */
if (!mdesc->smp_init || !mdesc->smp_init()) {
/* Prefer the PSCI ops when available, else the machine's own ops. */
if (psci_smp_available())
smp_set_ops(&psci_smp_ops);
else if (mdesc->smp)
smp_set_ops(mdesc->smp);
}
smp_init_cpus();
smp_build_mpidr_hash();
}
#endif
其中mdesc(机器描述符)是在arch/arm/mach-vexpress/v2m.c中通过DT_MACHINE_START宏来定义并初始化的，如下所示。
/*
 * Machine descriptor for "ARM-Versatile Express". The .smp and .smp_init
 * fields are the ones consulted by setup_arch() when selecting the SMP
 * operations: .smp is the address of vexpress_smp_dt_ops and .smp_init the
 * address of vexpress_smp_init_ops (both NULL when CONFIG_SMP is off, per
 * the smp_ops()/smp_init_ops() macros).
 */
DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express")
.dt_compat = v2m_dt_match,
.l2c_aux_val = 0x00400000,
.l2c_aux_mask = 0xfe0fffff,
.smp = smp_ops(vexpress_smp_dt_ops),
.smp_init = smp_init_ops(vexpress_smp_init_ops),
MACHINE_END
在Vexpress平台上，mdesc->smp_init是非空的(指向vexpress_smp_init_ops)，但mdesc->smp_init()的返回值为false(原因见下文)，因此会进入上面的if分支去设置smp_operations。该函数实现如下：
/*
 * Machine smp_init hook for Vexpress.
 *
 * Returns true only when CONFIG_MCPM is enabled and an available
 * "arm,cci-400" device-tree node is found; in that case
 * mcpm_smp_set_ops() has been called before returning. In every other
 * case (CONFIG_MCPM off, node missing, node not available) returns false.
 */
bool __init vexpress_smp_init_ops(void)
{
#ifdef CONFIG_MCPM
/*
* The best way to detect a multi-cluster configuration at the moment
* is to look for the presence of a CCI in the system.
* Override the default vexpress_smp_ops if so.
*/
struct device_node *node;
node = of_find_compatible_node(NULL, NULL, "arm,cci-400");
if (node && of_device_is_available(node)) {
mcpm_smp_set_ops();
return true;
}
#endif
return false;
}
在内核的配置中打开了CONFIG_MCPM的配置项(不太明白为什么打开该配置，难道ARM Vexpress是multi-cluster？)，但dts中并没有配置compatible="arm,cci-400"的节点，因此该函数返回false。又由于dts中也没有配置psci节点，因此在初始化smp_operations结构时，会调用smp_set_ops(mdesc->smp)对smp_operations结构进行初始化，其代码如下：
/*
 * Install a set of SMP operations.
 *
 * Copies the structure pointed to by @ops into smp_ops (a struct copy,
 * not a pointer assignment). A NULL @ops is ignored and leaves smp_ops
 * unchanged.
 */
void __init smp_set_ops(const struct smp_operations *ops)
{
if (ops)
smp_ops = *ops;
};
由上可见，该函数只是把传下来的mdesc->smp所指向的结构体内容拷贝给了smp_ops(是结构体拷贝，而不是指针赋值)，smp_ops是一个全局变量，定义在arch/arm/kernel/smp.c中，如下所示：
/* The active SMP operations for this file; filled in by smp_set_ops(). */
static struct smp_operations smp_ops __ro_after_init;
而mdesc->smp是通过smp_ops()宏来赋值的(注意该宏与上面的全局变量smp_ops同名，但不是同一个东西)，其定义如下：
/*
 * Helpers for initialising the .smp and .smp_init fields of a machine
 * descriptor. With CONFIG_SMP they evaluate to the address of their
 * argument; without it they evaluate to a suitably typed NULL.
 */
#ifdef CONFIG_SMP
#define smp_ops(ops) (&(ops))
#define smp_init_ops(ops) (&(ops))
#else
#define smp_ops(ops) (struct smp_operations *)NULL
#define smp_init_ops(ops) (bool (*)(void))NULL
#endif
因此，mdesc->smp指向的就是vexpress_smp_dt_ops，经过smp_set_ops()之后，全局变量smp_ops的内容也就是vexpress_smp_dt_ops的一份拷贝，该结构的定义如下：
/*
 * SMP operations for Vexpress, referenced from the machine descriptor via
 * smp_ops(vexpress_smp_dt_ops). Only the prepare, secondary-init and
 * boot-secondary hooks are set here (plus cpu_die under
 * CONFIG_HOTPLUG_CPU); the remaining members are left zero-initialised.
 */
const struct smp_operations vexpress_smp_dt_ops __initconst = {
.smp_prepare_cpus = vexpress_smp_dt_prepare_cpus,
.smp_secondary_init = versatile_secondary_init,
.smp_boot_secondary = versatile_boot_secondary,
#ifdef CONFIG_HOTPLUG_CPU
.cpu_die = vexpress_cpu_die,
#endif
};
2.2.多核启动
多核的启动函数调用流程主要如下所示：
start_kernel()->rest_init()->kernel_init()->kernel_init_freeable()->smp_init()
在smp_init()中，会通过for_each_present_cpu，让每一个present的cpu wakeup起来，代码如下：
/*
 * Called by boot processor to activate the rest.
 *
 * Walks the present CPUs and calls cpu_up() on each one, stopping as soon
 * as the number of online CPUs reaches setup_max_cpus. The return value of
 * cpu_up() is not checked here.
 */
void __init smp_init(void)
{
unsigned int cpu;
idle_threads_init();
cpuhp_threads_init();
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
/* Enough CPUs are online already; do not bring up any more. */
if (num_online_cpus() >= setup_max_cpus)
break;
cpu_up(cpu);
}
/* Any cleanup work */
smp_announce();
smp_cpus_done(setup_max_cpus);
}
其中cpu_up()的调用流程如下:
cpu_up()->do_cpu_up()->_cpu_up()->cpuhp_up_callbacks()->cpuhp_invoke_callback()
cpu_up()调用do_cpu_up()时主要传两个参数，一个cpuid，一个cpu状态，如下所示：
/*
 * Bring @cpu up, requesting CPUHP_ONLINE as the target hotplug state.
 * Returns whatever do_cpu_up() returns.
 */
int cpu_up(unsigned int cpu)
{
return do_cpu_up(cpu, CPUHP_ONLINE);
}
EXPORT_SYMBOL_GPL(cpu_up);
而在_cpu_up()中，会通过min宏将传入的目标状态target与CPUHP_BRINGUP_CPU进行比较，取两者中较小的一个作为本次要到达的状态，代码如下：
/*
 * Requires cpu_add_remove_lock to be held
 *
 * Drive @cpu towards hotplug state @target.
 *
 * Returns -EINVAL if @cpu is not present, 0 without doing anything if the
 * CPU's state is already >= @target, the error from idle_thread_get() if
 * the idle task cannot be obtained, the error from cpuhp_kick_ap_work()
 * if that fails, and otherwise the result of cpuhp_up_callbacks().
 * The whole operation is bracketed by cpu_hotplug_begin()/done().
 */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle;
int ret = 0;
cpu_hotplug_begin();
if (!cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
/*
* The caller of do_cpu_up might have raced with another
* caller. Ignore it for now.
*/
if (st->state >= target)
goto out;
if (st->state == CPUHP_OFFLINE) {
/* Let it fail before we try to bring the cpu up */
idle = idle_thread_get(cpu);
if (IS_ERR(idle)) {
ret = PTR_ERR(idle);
goto out;
}
}
cpuhp_tasks_frozen = tasks_frozen;
st->target = target;
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
*/
if (st->state > CPUHP_BRINGUP_CPU) {
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
* return the error code..
*/
if (ret)
goto out;
}
/*
* Try to reach the target state. We max out on the BP at
* CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
* responsible for bringing it up to the target state.
*/
/* Clamp the target used on this path; st->target keeps the original. */
target = min((int)target, CPUHP_BRINGUP_CPU);
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpu_hotplug_done();
return ret;
}
而CPUHP_BRINGUP_CPU这些值是在cpuhotplug.h的枚举类型cpuhp_state中定义的，列举几个如下所示：
/*
 * CPU hotplug states. Abridged listing: only the first few enumerators
 * are shown and the "..." line stands for the ones left out.
 */
enum cpuhp_state {
CPUHP_OFFLINE,
CPUHP_CREATE_THREADS,
CPUHP_PERF_PREPARE,
CPUHP_PERF_X86_PREPARE,
CPUHP_PERF_X86_UNCORE_PREP,
...
}
从该枚举的完整定义可以看出CPUHP_ONLINE的值是最大的，因此经过min宏比较后，_cpu_up()调用cpuhp_up_callbacks()时传入的target为CPUHP_BRINGUP_CPU。
进入到cpuhp_up_callbacks()后，由于st->state是0(CPUHP_OFFLINE)，小于传下来的target，因此会进入while循环：每次先将st->state加1，再调用该状态对应的回调函数，直到st->state达到target(即CPUHP_BRINGUP_CPU)为止，从而依次完成启动该cpu之前的各项准备工作，代码如下所示：
/*
 * Step @cpu's hotplug state upwards, one state at a time, until it reaches
 * @target. For each new state the bring-up callback is invoked through
 * cpuhp_invoke_callback().
 *
 * On the first callback failure, st->target is reset to the state the CPU
 * had on entry, undo_cpu_up() is called, and the error is returned.
 * Returns 0 when @target is reached (or was already reached on entry).
 */
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
   enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
while (st->state < target) {
/* Advance first, then run the callback of the state just entered. */
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
break;
}
}
return ret;
}
进入到cpuhp_invoke_callback()函数后，首先根据传下来的st->state通过cpuhp_get_step()函数从全局数组cpuhp_bp_states[]中拿到相应的struct cpuhp_step结构变量，因此这里会遍历调用cpuhp_bp_states数组元素里的回调函数，代码如下：
/*
 * Invoke the callback(s) registered for hotplug @state on @cpu.
 *
 * @bringup selects the startup callback (true) or the teardown callback
 * (false). @node is only used for multi-instance steps: when non-NULL the
 * callback is invoked for that single instance, when NULL it is invoked
 * for every instance on the step's list.
 *
 * Returns 0 if the selected callback is not set or all invocations
 * succeed; otherwise the first non-zero callback result. When iterating
 * all instances of a multi-instance step, a failure triggers a rollback
 * of the instances that had already succeeded.
 */
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
int (*cbm)(unsigned int cpu, struct hlist_node *node);
int (*cb)(unsigned int cpu);
int ret, cnt;
/* Single-instance step: one callback, called with the cpu only. */
if (!step->multi_instance) {
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
if (!cbm)
return 0;
/* Single invocation for instance add/remove */
if (node) {
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
if (ret)
goto err;
/* Count successes so the rollback knows how many to undo. */
cnt++;
}
return 0;
err:
/* Rollback the instances if one failed */
/* Pick the callback for the opposite direction of the one that failed. */
cbm = !bringup ? step->startup.multi : step->teardown.multi;
if (!cbm)
return ret;
hlist_for_each(node, &step->list) {
/* Stop after the first cnt instances, i.e. those that succeeded. */
if (!cnt--)
break;
cbm(cpu, node);
}
return ret;
}
其中cpuhp_bp_states是定义在kernel/cpu.c中，列举几个如下所示：
/*
 * Boot processor state steps
 *
 * Table indexed by enum cpuhp_state; each entry names the step and gives
 * its startup and teardown callbacks (NULL means nothing to call in that
 * direction). Abridged listing: the "..." lines stand for entries left
 * out. Without CONFIG_SMP only an empty CPUHP_BRINGUP_CPU entry follows
 * the CPUHP_OFFLINE one.
 */
static struct cpuhp_step cpuhp_bp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
.startup.single = NULL,
.teardown.single = NULL,
},
#ifdef CONFIG_SMP
[CPUHP_CREATE_THREADS]= {
.name = "threads:prepare",
.startup.single = smpboot_create_threads,
.teardown.single = NULL,
.cant_stop = true,
},
[CPUHP_PERF_PREPARE] = {
.name = "perf:prepare",
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
.teardown.single = NULL,
},
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
.teardown.single = hrtimers_dead_cpu,
},
...
[CPUHP_TIMERS_DEAD] = {
.name = "timers:dead",
.startup.single = NULL,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = NULL,
.cant_stop = true,
},
...
[CPUHP_TEARDOWN_CPU] = {
.name = "cpu:teardown",
.startup.single = NULL,
.teardown.single = takedown_cpu,
.cant_stop = true,
},
#else
[CPUHP_BRINGUP_CPU] = { },
#endif
};
由上分析，当遍历到CPUHP_BRINGUP_CPU元素时，便会调用到bringup_cpu()函数来启动对应的cpu，该函数的主要调用流程如下：
bringup_cpu()->__cpu_up()-> smp_ops.smp_boot_secondary()
看到smp_ops.smp_boot_secondary这个回调会不会感觉到很熟悉，没错，这就是之前初始化好的，定义在arch/arm/kernel/smp.c中的全局变量smp_ops，因此该回调函数的实现如下：
/*
 * smp_boot_secondary hook used by vexpress_smp_dt_ops.
 *
 * With boot_lock held, writes the target CPU's cpu_logical_map() value
 * via write_pen_release(), sends that CPU a wakeup IPI, and then polls
 * pen_release for up to 1 * HZ jiffies, in 10us steps, waiting for it
 * to become -1.
 *
 * Returns 0 if pen_release is -1 when checked after the lock is dropped,
 * -ENOSYS otherwise. @idle is not used in this function.
 */
int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
unsigned long timeout;
/*
* Set synchronisation state between this boot processor
* and the secondary one
*/
spin_lock(&boot_lock);
/*
* This is really belt and braces; we hold unintended secondary
* CPUs in the holding pen until we're ready for them.  However,
* since we haven't sent them a soft interrupt, they shouldn't
* be there.
*/
write_pen_release(cpu_logical_map(cpu));
/*
* Send the secondary CPU a soft interrupt, thereby causing
* the boot monitor to read the system wide flags register,
* and branch to the address found there.
*/
arch_send_wakeup_ipi_mask(cpumask_of(cpu));
/* Poll for at most one second's worth of jiffies. */
timeout = jiffies + (1 * HZ);
while (time_before(jiffies, timeout)) {
smp_rmb();
if (pen_release == -1)
break;
udelay(10);
}
/*
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
spin_unlock(&boot_lock);
return pen_release != -1 ? -ENOSYS : 0;
}
分析到这，剩下的启动流程就是与硬件相关了，因此不再深究硬件细节的具体实现，后续可以补充进来。

原作者：lunhui2016