2.8.4 算法思路

我们先来看调度器如何在调度域内部做负载均衡，这也是整个负载均衡逻辑的核心。完成该任务的函数是 load_balance, 其核心代码如下：

/* file: kernel/sched/fair.c */

static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *continue_balancing) {

  int ld_moved, cur_ld_moved, active_balance = 0;
  struct sched_group *group;
  struct rq *busiest;
  struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

  /* lb_env 用来封装负载均衡的上下文信息 */
  struct lb_env env = {
      .sd = sd,
      .dst_cpu = this_cpu,
      .dst_rq = this_rq,
      .dst_grpmask = sched_group_span(sd->groups),
      .idle = idle,
      .loop_break = sched_nr_migrate_break,
      .cpus = cpus,
      .fbq_type = all,
      .tasks = LIST_HEAD_INIT(env.tasks),
  };

  /* 1.
   * 判断是否需要在当前调度域的调度组之间做负载均衡
   *
   判断的逻辑是当前CPU(即第一个参数：this_cpu)是否空闲并需要从其它地方拉一点任务过来
   */
  if (!should_we_balance(&env)) {
    /* 如果不需要做负载均衡，则退出并将continue_balancing置0,
     * 该变量在上层函数中会用到，用以判断要不要继续遍历上层调度域 */
    , *continue_balancing = 0;
    goto out_balanced;
  }

  /* 找到当前调度域中最繁忙的调度组，主要思路是对比各个调度组的负载、算力以及CPU利用率，以此来确定哪个调度组最繁忙
   */
  group = find_busiest_group(&env);
  if (!group) {
    goto out_balanced;
  }

  /* 从最繁忙的调度组中找出最繁忙的运行队列，从该队列中拉取任务到当前CPU以平衡负载
   */
  busiest = find_busiest_queue(&env, group);
  if (!busiest) {
    goto out_balanced;
  }

  /* 设置负载均衡的源CPU及源队列 */
  env.src_cpu = busiest->cpu;
  env.src_rq = busiest;

  /* 将从源队列拉取任务 */
  cur_ld_moved = detach_tasks(&env);

  if (cur_ld_moved) {
    /* 将拉取出来的任务放置到目标队列上，目标队列即当前CPU的运行队列 */
    attach_tasks(&env);
    ld_moved += cur_ld_moved;
  }

  /* 删除了大量的代码 */
}

这里我们删除了绝大多数处理细节的代码，仅保留了最核心的逻辑，调度器做负载均衡的目的是为了提高CPU的总体利用率，因此在迁移任务时需要考虑各种因素，避免引入额外的性能开销。上面代码中的每一步都包含了非常多的细节，我们在这里最关心总体逻辑，不再一一深入。例如函数 detach_tasks 在选取目标任务时，就会判断任务是否可以被迁移到其它CPU, 一个场景是如果任务在其当前的CPU上的缓存还是热（cache-hot）的，那么将其迁移到其它CPU便会带来额外的性能开销，这就违背了做负载均衡的最初动机，因此调度器就不会对这类任务做迁移。

了解了单个调度域内的负载均衡之后，我们再来看一下整个系统层面的负载均衡过程。系统触发负载均衡的时机我们会在下一节讨论，这里我们仅通过函数 rebalance_domains 的实现了解总体逻辑：

/* file: kernel/sched/fair.c */

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) {
  int continue_balancing = 1;
  int cpu = rq->cpu;
  int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
  unsigned long interval;
  struct sched_domain *sd;

  for_each_domain(cpu, sd) {
    /* 调用函数load_balance时会将该变量指针传递进去，如果没有必要继续往上遍历调度域，则会将该变量置0
     */
    if (!continue_balancing) {
      break;
    }

    /* 计算出当前调度域负载均衡的时间间隔，这里会考虑CPU是否繁忙以及调度域的时间间隔区间
     */
    interval = get_sd_balance_interval(sd, busy);

    /* 如果当前时间已经超过负载均衡的时间间隔，则触发负载均衡操作 */
    if (time_after_eq(jiffies, sd->last_balance + interval)) {
      /* 调用 load_balance 对调度域 sd 进行负载均衡 */
      if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
        idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
        busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
      }
      /* 设置本次负载均衡的时间，也就是最近一次负载均衡的时间 */
      sd->last_balance = jiffies;
      /* 由于进行了负载均衡，动态计算时间间隔的各种因子可能发生了变化，因此这里重新计算一次
       */
      interval = get_sd_balance_interval(sd, busy);
    }
  }
}

这里我们也删除了很多处理细节的代码，主要是为了排除干扰，呈现出最根本的处理逻辑。主题逻辑包含在一个迭代中，该迭代通过宏 for_each_domain 来对当前CPU的调度域进行遍历，其定义如下：

/* file: kernel/sched/fair.c */
#define for_each_domain(cpu, __sd)                                             \
  for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd;       \
       __sd = __sd->parent)

cpu_rq(cpu)->sd 拿到的就是CPU最底层的调度域，即前文提到的Base Domain, 该宏自底向上逐级遍历指定CPU的调度域，按照这个顺序依次对各级的调度域做负载均衡，并且在合适的时候退出循环，越上层的调度域触发负载均衡的频率越低。

Previous2.8.3 数据结构 Next2.8.5 触发时机

Last updated 3 years ago

Was this helpful?