3.2.1 Data Structures
3.2.1.1 Page Tables
Paging relies on page tables. How many levels of page tables are used depends on the OS version and the hardware architecture; current Linux supports up to 5 levels, and the build-time option CONFIG_PGTABLE_LEVELS controls how many are enabled. From top to bottom the levels are listed below, followed by a small standalone sketch of how a virtual address is split across them:
Page Global Directory (PGD)
Page 4 Directory (P4D)
Page Upper Directory (PUD)
Page Middle Directory (PMD)
Page Table Entry (PTE)
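To make the levels concrete, here is a minimal standalone sketch, assuming the usual x86-64 layout of 4 KiB pages and 9 index bits per level; the helper level_index() is purely illustrative, while the real kernel does this with macros such as pgd_index() and pud_index():
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages */
#define LEVEL_BITS 9    /* 512 entries per table, i.e. 9 index bits per level */

/* level 0 = PTE index, 1 = PMD, 2 = PUD, 3 = P4D, 4 = PGD */
static unsigned long level_index(unsigned long vaddr, int level)
{
        return (vaddr >> (PAGE_SHIFT + level * LEVEL_BITS)) &
               ((1UL << LEVEL_BITS) - 1);
}

int main(void)
{
        unsigned long vaddr = 0x00007f1234567000UL;
        const char *name[] = { "PTE", "PMD", "PUD", "P4D", "PGD" };
        int level;

        for (level = 4; level >= 0; level--)
                printf("%s index: %lu\n", name[level], level_index(vaddr, level));
        return 0;
}
Walking the tables simply means using each of these indices, from PGD down to PTE, to pick the entry that points at the next-level table.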
The kernel defines a dedicated data type for the entry value at each page-table level:
/* file: arch/x86/include/asm/pgtable_64_types.h */
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
Underneath, all of these are plain unsigned longs; they are deliberately declared as distinct types so that the compiler can perform type checking. The entry types for each level are defined as follows:
/* file: arch/x86/include/asm/pgtable_types.h */
typedef struct { pgdval_t pgd; } pgd_t;
#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;
#else
#include <asm-generic/pgtable-nop4d.h>
#endif
#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
#else
#include <asm-generic/pgtable-nopud.h>
#endif
#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
#else
#include <asm-generic/pgtable-nopmd.h>
#endif
Each pgtable-nop[4um]d.h header contains the declarations used when the corresponding page-table level is not enabled (the level is "folded" away); we do not expand on them here.
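To illustrate the type-checking point above, here is a minimal standalone sketch (not kernel code; the names mirror the kernel's, but everything here is hypothetical): with bare unsigned longs the compiler would silently accept mixing page-table levels, whereas the one-member struct wrappers make it refuse:
/* standalone illustration of why the wrapper structs exist */
typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;

static void set_pte(pte_t *ptep, pte_t val)
{
        ptep->pte = val.pte;
}

static void example(pte_t *ptep, pmd_t pmd_entry)
{
        /* set_pte(ptep, pmd_entry);   <-- rejected: pmd_t is not pte_t */
        set_pte(ptep, (pte_t){ .pte = 0 });     /* accepted */
        (void)pmd_entry;
}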
The entries at every level share the same format: each one holds the physical address of the next-level table (or of the final page) together with a set of flags describing the target page. Typical flags include:
Present flag: whether the target page is currently in main memory
Dirty flag: whether the contents of the target page have been modified
Read/Write flag: the read/write permission of the target page
User/Supervisor flag: the privilege needed to access the target page; for example, user processes have no right to access the kernel's virtual address space
The remaining flags are not listed one by one here; in short, the bits of an unsigned long are more than enough to encode all of these states. A small standalone sketch of testing a few of them follows.
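As a standalone sketch, and assuming the x86 hardware bit positions (Present = bit 0, Read/Write = bit 1, User/Supervisor = bit 2, Dirty = bit 6), the flag bits can be tested like this; the kernel wraps the same tests in helpers such as pte_present() and pte_dirty(), and dump_pte() here is purely illustrative:
#include <stdio.h>

#define X86_PTE_PRESENT (1UL << 0)
#define X86_PTE_RW      (1UL << 1)
#define X86_PTE_USER    (1UL << 2)
#define X86_PTE_DIRTY   (1UL << 6)

static void dump_pte(unsigned long pteval)
{
        printf("present=%d rw=%d user=%d dirty=%d\n",
               !!(pteval & X86_PTE_PRESENT),
               !!(pteval & X86_PTE_RW),
               !!(pteval & X86_PTE_USER),
               !!(pteval & X86_PTE_DIRTY));
}

int main(void)
{
        dump_pte(X86_PTE_PRESENT | X86_PTE_RW | X86_PTE_DIRTY);
        return 0;
}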
3.2.1.2 Page
The page is the basic unit in which the OS manages physical memory. Each physical memory page is called a page frame, and every page frame has a number, the PFN (Page Frame Number). For every physical page frame, the kernel creates a page structure that tracks the page's state and bookkeeping information. struct page is at the heart of memory management, so its full definition is reproduced here:
/* file: include/linux/mm_types.h */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment
#endif
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
* WARNING: bit 0 of the first word is used for PageTail(). That
* means the other users of this union MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if PageSwapCache.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
};
struct { /* page_pool used by netstack */
/**
* @dma_addr: might require a 64-bit value on
* 32-bit architectures.
*/
unsigned long dma_addr[2];
};
struct { /* slab, slob and slub */
union {
/* the list this slab is linked on; the slab may sit on a partial list */
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
/* the kmem_cache that uses this page as a slab; set by allocate_slab() in slub.c */
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
/* When the page is used as a slab, this field of the slab's head page points to the slab's list of free objects.
 * While the slab is being used by a kmem_cache_cpu, however, this field is set to NULL and the freelist field of kmem_cache_cpu points to the slab's free objects instead */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
/* counters shares a union with the struct holding inuse, objects and frozen below, so when a new page needs those three fields set, assigning counters in one go is enough */
unsigned long counters; /* SLUB */
struct { /* SLUB */
/* number of objects in use; its initial value equals objects */
unsigned inuse : 16;
/* total number of objects in the slab, i.e. the value encoded in the low 15 bits of kmem_cache_order_objects in kmem_cache; set in allocate_slab() in slub.c */
unsigned objects : 15;
/* whether the slab is "frozen" to a CPU: while frozen, only that CPU may allocate objects from this slab, other CPUs may only free objects back to it; initialized to 1 */
unsigned frozen : 1;
};
};
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
/* First tail page only */
unsigned char compound_dtor;
unsigned char compound_order;
atomic_t compound_mapcount;
unsigned int compound_nr; /* 1 << compound_order */
};
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
atomic_t hpage_pinned_refcount;
/* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
unsigned long _pt_pad_2; /* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
* mapped so the next 3 words hold the mapping, index,
* and private fields from the source anonymous or
* page cache page while the page is migrated to device
* private memory.
* ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
* use the mapping, index, and private fields when
* pmem backed DAX files are mapped.
*/
};
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
};
union { /* This union is 4 bytes in size. */
/*
* If the page can be mapped to userspace, encodes the number
* of times this page is referenced by a page table.
*/
atomic_t _mapcount;
/*
* If the page is neither PageSlab nor mappable to userspace,
* the value stored here may help determine what this page
* is used for. See page-flags.h for a list of page types
* which are currently stored here.
*/
unsigned int page_type;
unsigned int active; /* SLAB */
int units; /* SLOB */
};
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
} _struct_page_alignment;
struct page and physical page frames are in one-to-one correspondence: at initialization the OS creates a page instance for every frame of physical memory, so the size of this structure has to be kept tightly under control to avoid excessive memory consumption. At the same time, a memory page can be used for many different purposes, and page has to carry state for all of them. The kernel engineers therefore designed the structure carefully: many of its fields are grouped into unions. The intuition is that although the system needs memory everywhere, a given physical page serves only one purpose at a time, so fields belonging to different purposes can overlap ("either/or"), which significantly reduces the overall size of the structure.
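To get a feel for why the size matters, here is a standalone back-of-the-envelope sketch; the 16 GiB of RAM and the 64-byte sizeof(struct page) are assumptions (64 bytes is the common size on 64-bit builds), not figures from the text:
#include <stdio.h>

int main(void)
{
        unsigned long long ram       = 16ULL << 30; /* assume 16 GiB of physical RAM   */
        unsigned long long page_size = 4096;        /* 4 KiB page frames               */
        unsigned long long struct_sz = 64;          /* assumed sizeof(struct page)     */

        unsigned long long frames   = ram / page_size;
        unsigned long long overhead = frames * struct_sz;

        printf("%llu frames -> %llu MiB of struct page (%.2f%% of RAM)\n",
               frames, overhead >> 20, 100.0 * overhead / ram);
        return 0;
}
This prints roughly 256 MiB, i.e. about 1.6% of the machine's memory spent purely on bookkeeping, which is why every extra byte in struct page is scrutinized.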
We will not go through the meaning of every field here; later chapters describe each one in detail where it is actually used. Two of the most important fields are worth a first look, though:
flags This field is an unsigned long that holds page flags of all kinds. The kernel has a dedicated header defining the flag bits, include/linux/page-flags.h; for example, PG_locked indicates that the page is currently locked. Space in flags is extremely precious, and only the most important flags earn a bit in it.
_refcount The reference count. Once it drops to 0, the page is no longer in use and can be handed back and reallocated to whoever needs it.
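As a hedged, kernel-context sketch of how these two fields are typically touched: the function inspect_page() below is hypothetical, but PageLocked(), get_page(), put_page() and page_to_pfn() are the relevant helpers; this is illustrative rather than a complete, loadable module:
#include <linux/mm.h>
#include <linux/printk.h>

static void inspect_page(struct page *page)
{
        get_page(page);         /* bumps _refcount so the page cannot be freed under us */

        if (PageLocked(page))   /* tests the PG_locked bit in page->flags */
                pr_info("page %lu is locked (e.g. under I/O)\n",
                        page_to_pfn(page));

        put_page(page);         /* drops _refcount; the page is freed once it reaches 0 */
}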
3.2.1.3 Zone
Ideally, all pages of memory would be functionally equivalent and usable for any purpose, but reality is not so simple: some DMA controllers, for example, can only address a limited range of physical memory. The kernel therefore divides the physical address space into regions, each called a Zone, and each Zone has its own purpose.
How the zones are laid out depends on the architecture and its word size; a typical 32-bit x86 system has the following zones:
ZONE_DMA: used for DMA; contains the pages below 16 MB
ZONE_NORMAL: contains the pages between 16 MB and 896 MB; this memory is mapped directly into the kernel's virtual address space
ZONE_HIGHMEM: contains the pages above 896 MB; the kernel uses this part of its address space to dynamically map additional memory or I/O
What is ZONE_HIGHMEM, and why is it needed?
We know that a 32-bit system has a 4 GB virtual address space and that the kernel and user space share it (see the previous section); the kernel uses the range 0xC0000000 to 0xFFFFFFFF, i.e. the top 1 GB. That means only 1 GB of physical memory can ever be mapped into the kernel's address space at once; if the machine has more, the kernel cannot reach it directly. To be able to map more memory flexibly, Linux splits the kernel's address space into two parts: the part covering the first 896 MB is called low memory (Low Memory), and the part above it high memory (High Memory). Translating a virtual address to a physical one normally requires page tables and the MMU, but for efficiency the kernel ties low memory directly to physical memory: physical addresses from 0 to 896 MB are mapped into the low-memory range simply by adding the offset 0xC0000000, so converting between physical and virtual addresses for low memory is trivial. The remaining 128 MB of high-memory addresses are instead mapped dynamically, through page tables, onto other physical memory; the kernel has several mechanisms for setting up such mappings, which we will not expand on here. Besides making more physical memory reachable, the kernel also has to reserve part of this address space for other purposes, such as mapping I/O addresses.
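The arithmetic for low memory is simple enough to show in a standalone sketch; PAGE_OFFSET = 0xC0000000 is the usual 32-bit default, and the two hypothetical helpers below mirror what the kernel's __va()/__pa() do for directly mapped memory:
#include <stdio.h>

#define PAGE_OFFSET 0xC0000000UL

static unsigned long phys_to_virt_lowmem(unsigned long phys)
{
        return phys + PAGE_OFFSET;      /* only valid for physical addresses below ~896 MB */
}

static unsigned long virt_to_phys_lowmem(unsigned long virt)
{
        return virt - PAGE_OFFSET;
}

int main(void)
{
        unsigned long phys = 0x01000000UL;      /* 16 MB */

        printf("phys 0x%08lx -> virt 0x%08lx\n", phys, phys_to_virt_lowmem(phys));
        printf("virt 0x%08lx -> phys 0x%08lx\n",
               0xC1000000UL, virt_to_phys_lowmem(0xC1000000UL));
        return 0;
}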
On 64-bit systems the addressable range is so much larger that the virtual address space available to the kernel is already big enough, so ZONE_HIGHMEM is no longer needed.
Which zones exist is configurable; they are declared as follows:
/* file: include/linux/mmzone.h */
enum zone_type {
/*
* ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
* to DMA to all of the addressable memory (ZONE_NORMAL).
* On architectures where this area covers the whole 32 bit address
* space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
* DMA addressing constraints. This distinction is important as a 32bit
* DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
* platforms may need both zones as they support peripherals with
* different DMA addressing limitations.
*/
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/*
* ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
* movable pages with few exceptional cases described below. Main use
* cases for ZONE_MOVABLE are to make memory offlining/unplug more
* likely to succeed, and to locally limit unmovable allocations - e.g.,
* to increase the number of THP/huge pages. Notable special cases are:
*
* 1. Pinned pages: (long-term) pinning of movable pages might
* essentially turn such pages unmovable. Memory offlining might
* retry a long time.
* 2. memblock allocations: kernelcore/movablecore setups might create
* situations where ZONE_MOVABLE contains unmovable allocations
* after boot. Memory offlining and allocations fail early.
* 3. Memory holes: kernelcore/movablecore setups might create very rare
* situations where ZONE_MOVABLE contains memory holes after boot,
* for example, if we have sections that are only partially
* populated. Memory offlining and allocations fail early.
* 4. PG_hwpoison pages: while poisoned pages can be skipped during
* memory offlining, such pages cannot be allocated.
* 5. Unmovable PG_offline pages: in paravirtualized environments,
* hotplugged memory blocks might only partially be managed by the
* buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
* parts not manged by the buddy are unmovable PG_offline pages. In
* some cases (virtio-mem), such pages can be skipped during
* memory offlining, however, cannot be moved/allocated. These
* techniques might use alloc_contig_range() to hide previously
* exposed pages from the buddy again (e.g., to implement some sort
* of memory unplug in virtio-mem).
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
* have to expect that migrating pages in ZONE_MOVABLE can fail (even
* if has_unmovable_pages() states that there are no unmovable pages,
* there can be false negatives).
*/
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
As the declaration shows, only ZONE_NORMAL and ZONE_MOVABLE are always present; all other zones are optional. ZONE_MOVABLE is a special zone: the pages in it must be migratable, and its main uses are memory hot-plug/unplug and memory compaction (to reduce fragmentation, here at page granularity).
On Linux, the zones of the running system can be inspected through the proc filesystem:
cat /proc/zoneinfo
The kernel data structure that describes a Zone is defined as follows:
/* file: include/linux/mmzone.h */
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
/*
* the high and batch values are copied to individual pagesets for
* faster access
*/
int pageset_high;
int pageset_batch;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* cma pages is present pages that are assigned for CMA use
* (MIGRATE_CMA).
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
#ifdef CONFIG_CMA
unsigned long cma_pages;
#endif
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
* compact_order_failed is the minimum compaction failed order.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
The important fields are:
_watermark
The watermarks describe how much of the zone's memory is in use and are what triggers reclaim, swapping and similar behaviour; a sketch of how they are consulted follows this list. They are defined as:
/* file: include/linux/mmzone.h */
enum zone_watermarks {
        WMARK_MIN,      /* minimum watermark: memory is critically short */
        WMARK_LOW,      /* low watermark: memory is starting to come under pressure */
        WMARK_HIGH,     /* high watermark: memory is plentiful */
        NR_WMARK        /* number of watermarks; used as the size of the zone's watermark array */
};
struct pglist_data *zone_pgdat
The Node this zone belongs to; the Node concept is introduced in the next section.
struct per_cpu_pageset __percpu *pageset
A zone is a global structure, so allocations and frees issued by many CPUs would contend on it heavily. The kernel therefore keeps a per-CPU cache of pages inside each zone, and this field manages those per-CPU cached pages.
zone_start_pfn
The starting page frame number of the zone.
managed_pages, spanned_pages, present_pages
The number of pages handled by the zone; the meaning of each field and how it is calculated are described in the comment in the structure above.
name
The zone's name, e.g. "DMA" or "Normal".
struct free_area free_area[MAX_ORDER];
Used by the Buddy System; it is described in detail when the Buddy System is covered later.
flags
Flags recording various pieces of zone state.
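As a hedged, kernel-context sketch of how the watermarks are consulted: zone_under_pressure() is hypothetical, while zone_page_state(), NR_FREE_PAGES and low_wmark_pages() are the real interfaces; the allocator's actual logic is considerably more involved:
#include <linux/mmzone.h>
#include <linux/vmstat.h>

static bool zone_under_pressure(struct zone *zone)
{
        unsigned long free_pages = zone_page_state(zone, NR_FREE_PAGES);

        /* Falling below the low watermark is roughly what wakes kswapd;
         * falling below min is where direct reclaim starts. */
        return free_pages < low_wmark_pages(zone);
}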
Because zones are accessed very frequently by the CPUs, the whole structure is required to be aligned to the CPU's L1 cache lines for efficiency. It is also split into several groups by ZONE_PADDING, so that fields used for the same purpose end up in the same cache line.
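For reference, on SMP builds ZONE_PADDING boils down to a zero-size, cache-line-aligned marker, so each group of fields that follows it starts on a fresh cache line (simplified from include/linux/mmzone.h):
struct zone_padding {
        char x[0];
} ____cacheline_internodealigned_in_smp;

#define ZONE_PADDING(name)      struct zone_padding name;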
Because zones partition memory according to its intended use, they are also what the system allocates memory from directly: the kernel's lowest-level memory manager, the Buddy System, allocates and frees memory on a per-zone basis.
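As a hedged, kernel-context sketch of that interface: buddy_round_trip() is hypothetical, while alloc_pages() and __free_pages() are the entry points through which the buddy allocator hands out and takes back page frames:
#include <linux/gfp.h>
#include <linux/mm.h>

static void buddy_round_trip(void)
{
        /* Ask the buddy allocator for 2^2 = 4 physically contiguous frames. */
        struct page *page = alloc_pages(GFP_KERNEL, 2);

        if (!page)
                return;

        /* ... use the pages ... */

        __free_pages(page, 2);  /* hand the 4 frames back to the zone's free_area */
}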
3.2.1.4 Node
When discussing the scheduler's load balancing we covered different CPU topologies, whose main difference lies in how the CPUs access memory. Under NUMA (Non-Uniform Memory Access), each CPU cluster has its own local memory; each such block of local memory is called a Node in the kernel and is represented by the pglist_data structure, defined as follows:
/* file: include/linux/mmzone.h */
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
* describes the whole memory.
*
* Memory statistics and page replacement data structures are maintained on a
* per-zone basis.
*/
typedef struct pglist_data {
/*
* node_zones contains just the zones for THIS node. Not all of the
* zones may be populated, but it is the full list. It is referenced by
* this node's node_zonelists as well as other node's node_zonelists.
*/
struct zone node_zones[MAX_NR_ZONES];
/*
* node_zonelists contains references to all zones in all nodes.
* Generally the first zones will be references to this node's
* node_zones.
*/
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
* Also synchronizes pgdat->first_deferred_pfn during deferred page
* init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
* or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
* node reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
*
* Use mem_cgroup_lruvec() to look up lruvecs.
*/
struct lruvec __lruvec;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
The important fields include:
struct zone node_zones[MAX_NR_ZONES];
The zones this node is divided into; a sketch of walking the nodes and their zones follows this list.
struct zonelist node_zonelists[MAX_ZONELISTS];
Links together all zones of all nodes. Zones are what allocations deal with directly, so when the current node does not have enough memory the system has to request memory from zones elsewhere, and this list exists to make that traversal convenient.
struct page *node_mem_map;
Records all the pages belonging to this node; it comes up again when the physical memory models are introduced later.
node_start_pfn
The starting PFN (Page Frame Number) of this node.
node_present_pages, node_spanned_pages
The node-wide counterparts of present_pages and spanned_pages; see the corresponding fields of struct zone for their exact meaning.
flags
Flags recording various pieces of node state.
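As a hedged, kernel-context sketch of walking the node -> zone hierarchy just described: dump_node_zones() is hypothetical, while for_each_online_node(), NODE_DATA() and populated_zone() are the iterators and helpers the kernel provides for this:
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

static void dump_node_zones(void)
{
        int nid;

        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
                int i;

                for (i = 0; i < MAX_NR_ZONES; i++) {
                        struct zone *zone = &pgdat->node_zones[i];

                        if (!populated_zone(zone))      /* skip zones with no pages */
                                continue;

                        pr_info("node %d zone %-8s start_pfn=%lu present_pages=%lu\n",
                                nid, zone->name, zone->zone_start_pfn,
                                zone->present_pages);
                }
        }
}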
Under UMA (Uniform Memory Access), all of the system's memory is represented by a single node. node, zone and page are the three core data structures of the memory-management subsystem; their basic relationship is illustrated in the figure below:
(figure: the node / zone / page hierarchy)