/* file: kernel/sched/fair.c */
/*
 * Raise the load-balance softirq for @rq when its next balance time has
 * been reached, then run the nohz balancer kick logic.
 */
void trigger_load_balance(struct rq *rq)
{
	/* No rebalancing is needed while attached to the NULL domain. */
	if (unlikely(on_null_domain(rq)))
		return;

	/* Nor when the runqueue's CPU is not active. */
	if (unlikely(!cpu_active(cpu_of(rq))))
		return;

	/*
	 * Once jiffies has reached rq->next_balance, raise SCHED_SOFTIRQ.
	 * Its handler, run_rebalance_domains, is registered in
	 * init_sched_fair_class.
	 */
	if (time_after_eq(jiffies, rq->next_balance)) {
		raise_softirq(SCHED_SOFTIRQ);
	}

	/* Trigger nohz load balancing. */
	nohz_balancer_kick(rq);
}
/* file: kernel/sched/fair.c */
/*
 * Perform load balancing for the current CPU's runqueue: first try the
 * nohz idle balance, and fall back to the normal per-domain rebalance
 * when nohz_idle_balance() reports that it did not handle the request.
 */
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *rq = this_rq();
	enum cpu_idle_type idle;

	/* Classify this CPU from the runqueue's idle_balance flag. */
	if (rq->idle_balance)
		idle = CPU_IDLE;
	else
		idle = CPU_NOT_IDLE;

	/*
	 * If this CPU has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle CPUs whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle CPUs a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
	if (!nohz_idle_balance(rq, idle)) {
		/* normal load balance */
		update_blocked_averages(rq->cpu);
		rebalance_domains(rq, idle);
	}
}
繁忙的 CPU 发起请求,通过 IPI 唤醒 idle CPU。该逻辑的入口函数是 nohz_balancer_kick,在函数 trigger_load_balance 中被繁忙的 CPU 随着时钟节拍周期性地调用:
/* file: kernel/sched/fair.c */
/*
 * Raise the load-balance softirq for @rq when its next balance time has
 * been reached, then run the nohz balancer kick logic.
 */
void trigger_load_balance(struct rq *rq)
{
	/* No rebalancing is needed while attached to the NULL domain. */
	if (unlikely(on_null_domain(rq)))
		return;

	/* Nor when the runqueue's CPU is not active. */
	if (unlikely(!cpu_active(cpu_of(rq))))
		return;

	/*
	 * Once jiffies has reached rq->next_balance, raise SCHED_SOFTIRQ.
	 * Its handler, run_rebalance_domains, is registered in
	 * init_sched_fair_class.
	 */
	if (time_after_eq(jiffies, rq->next_balance)) {
		raise_softirq(SCHED_SOFTIRQ);
	}

	/* Trigger the nohz load-balancing logic. */
	nohz_balancer_kick(rq);
}
函数 nohz_balancer_kick 首先根据自己队列的任务情况判断是否需要唤醒其他 idle CPU 来为自己分担压力,如果是的话就通过 IPI 对 idle CPU 进行唤醒,被唤醒的 CPU 会从其它繁忙的 CPU 拉取任务。内核此处将繁忙的 CPU 称为 kicker,目标 idle CPU 称为 kickee,可以直观地理解成繁忙的 CPU 将在睡觉的 idle CPU 踢起来干活了。
/* file: kernel/sched/fair.c */
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
* idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
/* 找到 idle CPU */
ilb_cpu = find_new_ilb();
/* 通过 IPI 通知目标的 idle CPU */
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
如果是被唤醒的 idle CPU, 则会通过函数 nohz_idle_balance 完成负载均衡,该函数会对所有的 idle CPU 进行负载均衡,负载均衡的逻辑与上一节讲的大致一样,这里不再展开。
5.3. newidle balance
在CPU进入 idle 之前也可以主动发起负载均衡,尝试着从其它 CPU 拉取一些任务过来执行,如果拉取不到再进入 idle 状态也不迟。该逻辑的入口函数就是 newidle_balance, 该函数的主体逻辑为:
/* file: kernel/sched/fair.c */
/*
* newidle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
* < 0 - we released the lock and there are !fair tasks present
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
for_each_domain(this_cpu, sd)
{
int continue_balancing = 1;
u64 t0, domain_cost;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
}
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq, sd,
CPU_NEWLY_IDLE,
&continue_balancing);
domain_cost = sched_clock_cpu(this_cpu) - t0;
if (domain_cost > sd->max_newidle_lb_cost)
sd->max_newidle_lb_cost = domain_cost;
curr_cost += domain_cost;
}
update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
if (pulled_task || this_rq->nr_running > 0)
break;
}
}