3.2.1 Data Structures

3.2.1.1 Page Tables

Paging relies on page tables. The number of page-table levels depends on the OS version and the hardware architecture; the current Linux kernel supports up to five levels, and the build configuration parameter CONFIG_PGTABLE_LEVELS controls how many are enabled. The levels are (a sketch after the list shows how they carve up an address):

  • Page Global Directory (PGD)

  • Page 4 Directory (P4D)

  • Page Upper Directory (PUD)

  • Page Middle Directory (PMD)

  • Page Table Entry (PTE)
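For concreteness, here is a sketch of how a 48-bit x86_64 virtual address is sliced under 4-level paging (CONFIG_PGTABLE_LEVELS=4, with the P4D level folded away): each level indexes a 512-entry table with 9 bits, leaving a 12-bit offset within the page. The helper names are ours; the shift values match the kernel's PGDIR_SHIFT/PUD_SHIFT/PMD_SHIFT/PAGE_SHIFT.

/* sketch: index extraction for 4-level paging on x86_64 (bits 47..12) */
#define TABLE_MASK 0x1ffUL /* 9 bits -> 512 entries per table */

static inline unsigned long pgd_idx(unsigned long va) { return (va >> 39) & TABLE_MASK; }
static inline unsigned long pud_idx(unsigned long va) { return (va >> 30) & TABLE_MASK; }
static inline unsigned long pmd_idx(unsigned long va) { return (va >> 21) & TABLE_MASK; }
static inline unsigned long pte_idx(unsigned long va) { return (va >> 12) & TABLE_MASK; }
/* the low 12 bits (va & 0xfff) are the offset within the 4 KiB page */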

The kernel defines a dedicated data type for the entries of each level:

/* file: arch/x86/include/asm/pgtable_64_types.h */

typedef unsigned long   pteval_t;
typedef unsigned long   pmdval_t;
typedef unsigned long   pudval_t;
typedef unsigned long   p4dval_t;
typedef unsigned long   pgdval_t;
typedef unsigned long   pgprotval_t;

typedef struct { pteval_t pte; } pte_t;

At bottom these are all unsigned longs; declaring them as distinct types lets the compiler enforce type checking. The types for the other levels' entries are defined as follows:

/* file: arch/x86/include/asm/pgtable_types.h */

typedef struct { pgdval_t pgd; } pgd_t;

#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;
#else
#include <asm-generic/pgtable-nop4d.h>
#endif

#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
#else
#include <asm-generic/pgtable-nopud.h>
#endif

#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
#else
#include <asm-generic/pgtable-nopmd.h>
#endif

Each pgtable-nop[4um]d.h header carries the declarations used when the corresponding level is not enabled; we won't expand on them here.
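The payoff of boxing each value in a one-member struct is that accidentally mixing levels becomes a compile error; raw values cross the boundary only through accessor macros such as the kernel's pte_val() and __pte(). A minimal sketch (the value and the demo function are ours):

/* hypothetical demo: wrap and unwrap a pte */
static void pte_type_demo(void)
{
  pte_t pte = __pte(0x1000 | 1); /* build a pte_t from a raw pteval_t */
  pteval_t raw = pte_val(pte);   /* read the raw value back out */

  (void)raw;
  /* pmd_t pmd = pte;               error: pte_t and pmd_t are distinct types */
}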

Entries at every level share the same kind of content: alongside the physical address of the next-level table (or of the final page), each entry carries a set of flags describing the target page. Typical flags include:

  • Present flag: whether the target page is resident in main memory

  • Dirty flag: whether the target page's contents have been modified

  • Read/Write flag: the read and write permissions on the target page

  • User/Supervisor flag: the privilege level required to access the page; a user process, for example, has no right to touch the kernel's virtual address space

We won't enumerate the remaining flags one by one; in short, an unsigned long has more than enough bits to represent all of these states.
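To see the five levels in action, here is a condensed sketch of a software page-table walk in the style of the kernel's own lookup helpers; pgd_offset() through pte_offset_map() and the *_none() predicates are real kernel APIs, while the function itself is our simplification with error handling pared down.

/* sketch: walk mm's page tables down to the PTE for address addr */
static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
{
  pgd_t *pgd = pgd_offset(mm, addr); /* level 1: PGD */
  p4d_t *p4d;
  pud_t *pud;
  pmd_t *pmd;

  if (pgd_none(*pgd))
    return NULL;
  p4d = p4d_offset(pgd, addr);       /* level 2: P4D (folded if disabled) */
  if (p4d_none(*p4d))
    return NULL;
  pud = pud_offset(p4d, addr);       /* level 3: PUD */
  if (pud_none(*pud))
    return NULL;
  pmd = pmd_offset(pud, addr);       /* level 4: PMD */
  if (pmd_none(*pmd))
    return NULL;
  return pte_offset_map(pmd, addr);  /* level 5: the PTE itself */
}

A caller would test pte_present(*pte) before trusting the mapping, and release it with pte_unmap() when done.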

3.2.1.2 Page

The page is the basic unit in which the OS manages physical memory. Each physical memory page is called a page frame, and every frame has a number, the PFN (Page Frame Number). For each physical page frame the kernel creates a data structure called page to track the frame's state and usage. page is the heart of memory management, so we quote its code in full:

/* file: include/linux/mm_types.h */

#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment
#endif

struct page {
  unsigned long flags; /* Atomic flags, some possibly
                        * updated asynchronously */
  /*
   * Five words (20/40 bytes) are available in this union.
   * WARNING: bit 0 of the first word is used for PageTail(). That
   * means the other users of this union MUST NOT use the bit to
   * avoid collision and false-positive PageTail().
   */
  union {
    struct { /* Page cache and anonymous pages */
      /**
       * @lru: Pageout list, eg. active_list protected by
       * lruvec->lru_lock.  Sometimes used as a generic list
       * by the page owner.
       */
      struct list_head lru;
      /* See page-flags.h for PAGE_MAPPING_FLAGS */
      struct address_space *mapping;
      pgoff_t index; /* Our offset within mapping. */
      /**
       * @private: Mapping-private opaque data.
       * Usually used for buffer_heads if PagePrivate.
       * Used for swp_entry_t if PageSwapCache.
       * Indicates order in the buddy system if PageBuddy.
       */
      unsigned long private;
    };
    struct { /* page_pool used by netstack */
      /**
       * @dma_addr: might require a 64-bit value on
       * 32-bit architectures.
       */
      unsigned long dma_addr[2];
    };
    struct { /* slab, slob and slub */
      union {
        /* slab list; the slab may sit on a partial list */
        struct list_head slab_list;
        struct { /* Partial pages */
          struct page *next;
#ifdef CONFIG_64BIT
          int pages; /* Nr of pages left */
          int pobjects; /* Approximate count */
#else
          short int pages;
          short int pobjects;
#endif
        };
      };
      /* the kmem_cache using this page as a slab; set by allocate_slab() in slub.c */
      struct kmem_cache *slab_cache; /* not slob */
      /* Double-word boundary */
      /* When the page backs a slab, this field of the slab's head page points
       * to the slab's free-object list. While the slab is actively used by a
       * kmem_cache_cpu, however, this field is set to NULL and the free list
       * is tracked by the freelist field of kmem_cache_cpu instead */
      void *freelist; /* first free object */
      union {
        void *s_mem; /* slab: first object */
        /* counters overlays the struct holding the inuse, objects and frozen
         * fields below, so code that builds a page and needs to set all three
         * fields can simply assign a single value through counters */
        unsigned long counters; /* SLUB */
        struct { /* SLUB */
          /* number of objects in use; its initial value equals objects */
          unsigned inuse : 16;
          /* total number of objects in the slab, i.e. the value in the low
           * 15 bits of kmem_cache_order_objects in kmem_cache; set by
           * allocate_slab() in slub.c */
          unsigned objects : 15;
          /* whether this slab is "frozen" by a CPU: when frozen, only the
           * owning CPU may allocate objects from the slab, while other CPUs
           * may only free objects back to it; initialized to 1 */
          unsigned frozen : 1;
        };
      };
    };
    struct { /* Tail pages of compound page */
      unsigned long compound_head; /* Bit zero is set */

      /* First tail page only */
      unsigned char compound_dtor;
      unsigned char compound_order;
      atomic_t compound_mapcount;
      unsigned int compound_nr; /* 1 << compound_order */
    };
    struct { /* Second tail page of compound page */
      unsigned long _compound_pad_1; /* compound_head */
      atomic_t hpage_pinned_refcount;
      /* For both global and memcg */
      struct list_head deferred_list;
    };
    struct { /* Page table pages */
      unsigned long _pt_pad_1; /* compound_head */
      pgtable_t pmd_huge_pte; /* protected by page->ptl */
      unsigned long _pt_pad_2; /* mapping */
      union {
        struct mm_struct *pt_mm; /* x86 pgds only */
        atomic_t pt_frag_refcount; /* powerpc */
      };
#if ALLOC_SPLIT_PTLOCKS
      spinlock_t *ptl;
#else
      spinlock_t ptl;
#endif
    };
    struct { /* ZONE_DEVICE pages */
      /** @pgmap: Points to the hosting device page map. */
      struct dev_pagemap *pgmap;
      void *zone_device_data;
      /*
       * ZONE_DEVICE private pages are counted as being
       * mapped so the next 3 words hold the mapping, index,
       * and private fields from the source anonymous or
       * page cache page while the page is migrated to device
       * private memory.
       * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
       * use the mapping, index, and private fields when
       * pmem backed DAX files are mapped.
       */
    };

    /** @rcu_head: You can use this to free a page by RCU. */
    struct rcu_head rcu_head;
  };

  union { /* This union is 4 bytes in size. */
    /*
     * If the page can be mapped to userspace, encodes the number
     * of times this page is referenced by a page table.
     */
    atomic_t _mapcount;

    /*
     * If the page is neither PageSlab nor mappable to userspace,
     * the value stored here may help determine what this page
     * is used for.  See page-flags.h for a list of page types
     * which are currently stored here.
     */
    unsigned int page_type;

    unsigned int active; /* SLAB */
    int units; /* SLOB */
  };

  /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
  atomic_t _refcount;

#ifdef CONFIG_MEMCG
  unsigned long memcg_data;
#endif

  /*
   * On machines where all RAM is mapped into kernel address space,
   * we can simply calculate the virtual address. On machines with
   * highmem some memory is mapped into kernel virtual memory
   * dynamically, so we need a place to store that address.
   * Note that this field could be 16 bits on x86 ... ;)
   *
   * Architectures with slow multiplication can define
   * WANT_PAGE_VIRTUAL in asm/page.h
   */
#if defined(WANT_PAGE_VIRTUAL)
  void *virtual; /* Kernel virtual address (NULL if
                    not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
  int _last_cpupid;
#endif
} _struct_page_alignment;

page maps one-to-one onto physical page frames, and at boot the OS instantiates a page for every frame of physical memory, so the structure must be kept small to avoid burning memory on bookkeeping. Yet a memory page can serve many different purposes, and page has to capture state for all of them. The kernel engineers therefore designed the structure with great care: as you can see, many fields are overlaid in unions. The insight is that although the system needs memory everywhere, a given physical page serves only one purpose at a time, so fields for different purposes can be "or"-ed over the same storage, which shrinks the overall structure significantly.
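To put numbers on the size concern: with 4 KiB pages, 16 GiB of RAM means 4,194,304 page instances, and at the 64 bytes the structure typically occupies on x86_64 that is already 256 MiB of pure metadata, roughly 1.6% of all memory. A hypothetical compile-time guard in the spirit of this constraint (BUILD_BUG_ON is the real kernel macro; the 64-byte bound and the wrapper function are our assumptions, not checks present in the kernel source):

/* hypothetical: fail the build if struct page outgrows one 64-byte cacheline */
static inline void check_struct_page_size(void)
{
  BUILD_BUG_ON(sizeof(struct page) > 64);
}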

We won't dwell on what every field means here; later chapters introduce each one in the context where it is used. Two fields are worth singling out now, though:

  • flags: an unsigned long used to hold page flags of every kind. The kernel has a dedicated header defining the flag bits, include/linux/page-flags.h; the flag PG_locked, for example, means the page is currently locked.

    Space in flags is precious real estate: only the most important flags earn a spot in this field.

  • _refcount: the page's reference count; once it drops to 0 the page is no longer in use and can be reallocated to whoever needs it.
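A minimal sketch tying these together, using real kernel helpers: pfn_to_page() and page_to_pfn() convert between a frame number and its page, page_count() reads _refcount, and PageLocked() tests PG_locked in flags. The wrapper function itself is hypothetical.

/* hypothetical helper: inspect the page structure behind a frame number */
static void inspect_pfn(unsigned long pfn)
{
  struct page *page = pfn_to_page(pfn); /* frame number -> struct page */

  pr_info("pfn %lu: refcount=%d locked=%d\n",
          page_to_pfn(page),  /* and back again */
          page_count(page),   /* wraps _refcount */
          PageLocked(page));  /* tests PG_locked in page->flags */
}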

3.2.1.3 Zone

Ideally, every page in memory would be functionally equivalent and usable for any purpose. Reality disagrees: some DMA controllers, for example, can only address a limited range of physical memory. The kernel therefore divides the physical address space into regions, each called a Zone and each with its own role.

How the zones are carved up depends on the architecture and word size. A typical 32-bit x86 system has the following zones:

  • ZONE_DMA: for DMA; holds the pages below 16 MB

  • ZONE_NORMAL: pages from 16 MB to 896 MB, mapped directly into the kernel's virtual address space

  • ZONE_HIGHMEM: pages above 896 MB; the kernel uses this address range to dynamically map additional memory or I/O

What is ZONE_HIGHMEM, and why is it needed?

Recall that a 32-bit system has a 4 GB virtual address space shared between the kernel and user space (see the previous section), and the kernel's share is the top 1 GB, 0xC0000000–0xFFFFFFFF. That means only 1 GB of physical memory can ever be mapped into the kernel's address space; if the machine has more, the kernel cannot reach it. To be able to map more memory flexibly, Linux splits the kernel's address space in two: the range below 896 MB is called low memory, and everything above it high memory.

Translating a virtual address into a physical one normally requires the page tables and the MMU, but for efficiency the kernel ties low memory directly to physical memory: physical addresses from 0 to 896 MB are mapped into the low-memory range simply by adding the offset 0xC0000000, which makes converting between the kernel's low-memory virtual addresses and physical addresses trivial. The 128 MB of high-memory addresses, by contrast, are mapped onto other physical memory dynamically through the page tables; the OS has several mechanisms for establishing such mappings, which we won't expand on here.
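The direct mapping is exactly the arithmetic behind the kernel's __pa()/__va() helpers; a minimal standalone sketch, assuming the 32-bit layout above where the offset (PAGE_OFFSET) is 0xC0000000:

/* sketch of the 32-bit low-memory direct mapping behind __pa()/__va() */
#define PAGE_OFFSET 0xC0000000UL

static inline unsigned long lowmem_virt_to_phys(void *vaddr)
{
  return (unsigned long)vaddr - PAGE_OFFSET; /* what __pa() computes */
}

static inline void *lowmem_phys_to_virt(unsigned long paddr)
{
  return (void *)(paddr + PAGE_OFFSET);      /* what __va() computes */
}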

Besides providing access to more physical memory, the kernel also reserves part of this address space for other purposes, such as mapping I/O regions.

On 64-bit systems the addressable range is so much larger that the kernel's share of virtual address space is ample, and ZONE_HIGHMEM is no longer needed.

Which zones the kernel sets up is configurable; the declaration is as follows:

/* file: include/linux/mmzone.h */

enum zone_type {
  /*
   * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
   * to DMA to all of the addressable memory (ZONE_NORMAL).
   * On architectures where this area covers the whole 32 bit address
   * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
   * DMA addressing constraints. This distinction is important as a 32bit
   * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
   * platforms may need both zones as they support peripherals with
   * different DMA addressing limitations.
   */
#ifdef CONFIG_ZONE_DMA
  ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
  ZONE_DMA32,
#endif
  /*
   * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
   * performed on pages in ZONE_NORMAL if the DMA devices support
   * transfers to all addressable memory.
   */
  ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
  /*
   * A memory area that is only addressable by the kernel through
   * mapping portions into its own address space. This is for example
   * used by i386 to allow the kernel to address the memory beyond
   * 900MB. The kernel will set up special mappings (page
   * table entries on i386) for each page that the kernel needs to
   * access.
   */
  ZONE_HIGHMEM,
#endif
  /*
   * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
   * movable pages with few exceptional cases described below. Main use
   * cases for ZONE_MOVABLE are to make memory offlining/unplug more
   * likely to succeed, and to locally limit unmovable allocations - e.g.,
   * to increase the number of THP/huge pages. Notable special cases are:
   *
   * 1. Pinned pages: (long-term) pinning of movable pages might
   *    essentially turn such pages unmovable. Memory offlining might
   *    retry a long time.
   * 2. memblock allocations: kernelcore/movablecore setups might create
   *    situations where ZONE_MOVABLE contains unmovable allocations
   *    after boot. Memory offlining and allocations fail early.
   * 3. Memory holes: kernelcore/movablecore setups might create very rare
   *    situations where ZONE_MOVABLE contains memory holes after boot,
   *    for example, if we have sections that are only partially
   *    populated. Memory offlining and allocations fail early.
   * 4. PG_hwpoison pages: while poisoned pages can be skipped during
   *    memory offlining, such pages cannot be allocated.
   * 5. Unmovable PG_offline pages: in paravirtualized environments,
   *    hotplugged memory blocks might only partially be managed by the
   *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
   *    parts not manged by the buddy are unmovable PG_offline pages. In
   *    some cases (virtio-mem), such pages can be skipped during
   *    memory offlining, however, cannot be moved/allocated. These
   *    techniques might use alloc_contig_range() to hide previously
   *    exposed pages from the buddy again (e.g., to implement some sort
   *    of memory unplug in virtio-mem).
   *
   * In general, no unmovable allocations that degrade memory offlining
   * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
   * have to expect that migrating pages in ZONE_MOVABLE can fail (even
   * if has_unmovable_pages() states that there are no unmovable pages,
   * there can be false negatives).
   */
  ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
  ZONE_DEVICE,
#endif
  __MAX_NR_ZONES
};

As you can see, the only zones guaranteed to exist are ZONE_NORMAL and ZONE_MOVABLE; all the others are optional. ZONE_MOVABLE is a special zone whose pages must all be migratable; its main uses are memory hot-unplug and memory compaction (which reduces fragmentation, here at page granularity).

On Linux you can inspect the running system's zones through the proc filesystem: cat /proc/zoneinfo.

The kernel data structure that models a zone is defined as follows:

/* file: include/linux/mmzone.h */

struct zone {
  /* Read-mostly fields */

  /* zone watermarks, access with *_wmark_pages(zone) macros */
  unsigned long _watermark[NR_WMARK];
  unsigned long watermark_boost;

  unsigned long nr_reserved_highatomic;

  /*
   * We don't know if the memory that we're going to allocate will be
   * freeable or/and it will be released eventually, so to avoid totally
   * wasting several GB of ram we must reserve some of the lower zone
   * memory (otherwise we risk to run OOM on the lower zones despite
   * there being tons of freeable ram on the higher zones).  This array is
   * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
   * changes.
   */
  long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
  int node;
#endif
  struct pglist_data *zone_pgdat;
  struct per_cpu_pageset __percpu *pageset;
  /*
   * the high and batch values are copied to individual pagesets for
   * faster access
   */
  int pageset_high;
  int pageset_batch;

#ifndef CONFIG_SPARSEMEM
  /*
   * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
   * In SPARSEMEM, this map is stored in struct mem_section
   */
  unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

  /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
  unsigned long zone_start_pfn;

  /*
   * spanned_pages is the total pages spanned by the zone, including
   * holes, which is calculated as:
   *    spanned_pages = zone_end_pfn - zone_start_pfn;
   *
   * present_pages is physical pages existing within the zone, which
   * is calculated as:
   *    present_pages = spanned_pages - absent_pages(pages in holes);
   *
   * managed_pages is present pages managed by the buddy system, which
   * is calculated as (reserved_pages includes pages allocated by the
   * bootmem allocator):
   *    managed_pages = present_pages - reserved_pages;
   *
   * cma pages is present pages that are assigned for CMA use
   * (MIGRATE_CMA).
   *
   * So present_pages may be used by memory hotplug or memory power
   * management logic to figure out unmanaged pages by checking
   * (present_pages - managed_pages). And managed_pages should be used
   * by page allocator and vm scanner to calculate all kinds of watermarks
   * and thresholds.
   *
   * Locking rules:
   *
   * zone_start_pfn and spanned_pages are protected by span_seqlock.
   * It is a seqlock because it has to be read outside of zone->lock,
   * and it is done in the main allocator path.  But, it is written
   * quite infrequently.
   *
   * The span_seq lock is declared along with zone->lock because it is
   * frequently read in proximity to zone->lock.  It's good to
   * give them a chance of being in the same cacheline.
   *
   * Write access to present_pages at runtime should be protected by
   * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
   * present_pages should get_online_mems() to get a stable value.
   */
  atomic_long_t managed_pages;
  unsigned long spanned_pages;
  unsigned long present_pages;
#ifdef CONFIG_CMA
  unsigned long cma_pages;
#endif

  const char *name;

#ifdef CONFIG_MEMORY_ISOLATION
  /*
   * Number of isolated pageblock. It is used to solve incorrect
   * freepage counting problem due to racy retrieving migratetype
   * of pageblock. Protected by zone->lock.
   */
  unsigned long nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
  /* see spanned/present_pages for more description */
  seqlock_t span_seqlock;
#endif

  int initialized;

  /* Write-intensive fields used from the page allocator */
  ZONE_PADDING(_pad1_)

  /* free areas of different sizes */
  struct free_area free_area[MAX_ORDER];

  /* zone flags, see below */
  unsigned long flags;

  /* Primarily protects free_area */
  spinlock_t lock;

  /* Write-intensive fields used by compaction and vmstats. */
  ZONE_PADDING(_pad2_)

  /*
   * When free pages are below this point, additional steps are taken
   * when reading the number of free pages to avoid per-cpu counter
   * drift allowing watermarks to be breached
   */
  unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  /* pfn where compaction free scanner should start */
  unsigned long compact_cached_free_pfn;
  /* pfn where compaction migration scanner should start */
  unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
  unsigned long compact_init_migrate_pfn;
  unsigned long compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
  /*
   * On compaction failure, 1<<compact_defer_shift compactions
   * are skipped before trying again. The number attempted since
   * last failure is tracked with compact_considered.
   * compact_order_failed is the minimum compaction failed order.
   */
  unsigned int compact_considered;
  unsigned int compact_defer_shift;
  int compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  /* Set to true when the PG_migrate_skip bits should be cleared */
  bool compact_blockskip_flush;
#endif

  bool contiguous;

  ZONE_PADDING(_pad3_)
  /* Zone statistics */
  atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
  atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

The important fields:

  • _watermark: watermarks describing how much of the zone's memory is in use; crossing them triggers actions such as memory reclaim or swapping (see the sketch after this list). They are defined as:

       /* file: include/linux/mmzone.h */

       enum zone_watermarks {
         WMARK_MIN,  /* lowest watermark: memory is critically short */
         WMARK_LOW,  /* low watermark: memory is coming under pressure */
         WMARK_HIGH, /* high watermark: memory is plentiful */
         NR_WMARK    /* number of watermarks; sizes the zone's watermark array */
       };
  • struct pglist_data *zone_pgdat: the node this zone belongs to; nodes are introduced in the next section

  • struct per_cpu_pageset __percpu *pageset: a zone is a global object, and CPUs allocating from and freeing to it concurrently would contend heavily, so the kernel gives each CPU a local cache of pages within the zone; this field manages those per-CPU cached pages

  • zone_start_pfn: the zone's starting page frame number

  • managed_pages, spanned_pages, present_pages: counts of the pages the zone manages; see the comments in the code above for what each one means and how it is computed

  • name: the zone's name, e.g. "DMA" or "Normal"

  • struct free_area free_area[MAX_ORDER];: used by the buddy system; covered in detail when we get to the buddy system

  • flags: assorted zone state bits
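As promised above, here is a minimal sketch of how the watermarks steer reclaim. zone_page_state() and the min/low/high_wmark_pages() accessors are real kernel APIs; the decision function itself is our simplification of what the page allocator and kswapd do.

/* hypothetical summary of the watermark logic */
static const char *zone_pressure(struct zone *zone)
{
  unsigned long free = zone_page_state(zone, NR_FREE_PAGES);

  if (free < min_wmark_pages(zone))
    return "critical: allocations stall and reclaim directly";
  if (free < low_wmark_pages(zone))
    return "low: wake kswapd to reclaim in the background";
  if (free < high_wmark_pages(zone))
    return "recovering: kswapd keeps reclaiming";
  return "plentiful: kswapd can go back to sleep";
}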

Because zones are accessed by the CPUs so frequently, the whole structure is required to be aligned to the CPU's L1 cache line for fast access. It is also carved into several sections by ZONE_PADDING, so that fields used for the same purpose land in the same cache line.

Since zones partition memory by purpose, they are also what memory allocation draws on directly: the kernel's lowest-level memory manager, the buddy system, allocates and frees memory zone by zone.
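For a taste of that interface, a minimal sketch using the buddy allocator's real entry points alloc_pages() and __free_pages(); the GFP flags select the candidate zones (GFP_KERNEL allows ZONE_NORMAL, while GFP_DMA would confine the request to ZONE_DMA).

/* sketch: grab and release 2^2 = 4 contiguous pages from the buddy system */
static void buddy_demo(void)
{
  struct page *pages = alloc_pages(GFP_KERNEL, 2); /* an order-2 request */

  if (pages)
    __free_pages(pages, 2); /* the order must match the allocation */
}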

3.2.1.4 Node

When we introduced load balancing in the scheduler we discussed the various CPU topologies, whose main difference lies in how CPUs access memory. Under NUMA (Non-Uniform Memory Access), each CPU cluster has local memory of its own, and each such block of local memory is called a node in the kernel, represented by the structure pglist_data, defined as follows:

/* file: include/linux/mmzone.h */

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * it's memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
  /*
   * node_zones contains just the zones for THIS node. Not all of the
   * zones may be populated, but it is the full list. It is referenced by
   * this node's node_zonelists as well as other node's node_zonelists.
   */
  struct zone node_zones[MAX_NR_ZONES];

  /*
   * node_zonelists contains references to all zones in all nodes.
   * Generally the first zones will be references to this node's
   * node_zones.
   */
  struct zonelist node_zonelists[MAX_ZONELISTS];

  int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
  struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
  struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
  /*
   * Must be held any time you expect node_start_pfn,
   * node_present_pages, node_spanned_pages or nr_zones to stay constant.
   * Also synchronizes pgdat->first_deferred_pfn during deferred page
   * init.
   *
   * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
   * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
   * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
   *
   * Nests above zone->lock and zone->span_seqlock
   */
  spinlock_t node_size_lock;
#endif
  unsigned long node_start_pfn;
  unsigned long node_present_pages; /* total number of physical pages */
  unsigned long node_spanned_pages; /* total size of physical page
                                       range, including holes */
  int node_id;
  wait_queue_head_t kswapd_wait;
  wait_queue_head_t pfmemalloc_wait;
  struct task_struct *kswapd; /* Protected by
                                 mem_hotplug_begin/end() */
  int kswapd_order;
  enum zone_type kswapd_highest_zoneidx;

  int kswapd_failures; /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
  int kcompactd_max_order;
  enum zone_type kcompactd_highest_zoneidx;
  wait_queue_head_t kcompactd_wait;
  struct task_struct *kcompactd;
#endif
  /*
   * This is a per-node reserve of pages that are not available
   * to userspace allocations.
   */
  unsigned long totalreserve_pages;

#ifdef CONFIG_NUMA
  /*
   * node reclaim becomes active if more unmapped pages exist.
   */
  unsigned long min_unmapped_pages;
  unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */

  /* Write-intensive fields used by page reclaim */
  ZONE_PADDING(_pad1_)

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
  /*
   * If memory initialisation on large machines is deferred then this
   * is the first PFN that needs to be initialised.
   */
  unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  struct deferred_split deferred_split_queue;
#endif

  /* Fields commonly accessed by the page reclaim scanner */

  /*
   * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
   *
   * Use mem_cgroup_lruvec() to look up lruvecs.
   */
  struct lruvec __lruvec;

  unsigned long flags;

  ZONE_PADDING(_pad2_)

  /* Per-node vmstats */
  struct per_cpu_nodestat __percpu *per_cpu_nodestats;
  atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

Important fields include:

  • struct zone node_zones[MAX_NR_ZONES];: the zones this node is divided into

  • struct zonelist node_zonelists[MAX_ZONELISTS];: references stringing together the zones of all nodes. Since zones are what allocation talks to directly, when the current node runs out of memory the system must request pages from zones elsewhere, and this list provides a convenient order to traverse them in

  • struct page *node_mem_map;: records all of this node's page structures; it comes up again when we cover physical memory models

  • node_start_pfn: the node's starting PFN (Page Frame Number)

  • node_present_pages, node_spanned_pages: the node-level counterparts of present_pages and spanned_pages; see the matching fields in struct zone for their meaning

  • flags: assorted node state bits

Under UMA (Uniform Memory Access), all of the system's memory is represented by a single node. node, zone and page are the three core data structures of the memory-management subsystem: each node is divided into zones, and each zone manages a range of physical pages.
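To close, a minimal sketch walking this hierarchy with real kernel iterators; for_each_online_node(), NODE_DATA() and populated_zone() are the standard accessors, while the dumping function itself is hypothetical.

/* hypothetical: print every populated zone of every online node */
static void dump_nodes(void)
{
  int nid;

  for_each_online_node(nid) {
    pg_data_t *pgdat = NODE_DATA(nid);
    int i;

    for (i = 0; i < pgdat->nr_zones; i++) {
      struct zone *zone = &pgdat->node_zones[i];

      if (!populated_zone(zone))
        continue;
      pr_info("node %d zone %-8s start_pfn=%lu present=%lu\n",
              nid, zone->name, zone->zone_start_pfn,
              zone->present_pages);
    }
  }
}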
