Kernel Memory Management

Preface

I had originally planned for this post to be either a source-code analysis of the syz-fuzzer component or a piece on Linux Rootkit, but the previous article walked through the SLUB allocation path in some detail, and a few readers mentioned that my blog lacks coverage of the fundamentals, so I decided to write this one as well. Let me explain why I have been reluctant to write about basics. While I was in school it was mainly because I wanted to keep learning new things (you can see I played with QEMU, Chrome and the like), and going back over the fundamentals just to write them down felt like a lot of effort for little reward. After starting my internship the main reason was that writing up basics cannot be counted as this week's work in the weekly report; you can see that during work hours I reproduced a lot of CVEs, and those posts already weave in plenty of fundamentals anyway — call it my silent resistance against the capitalists.

Of course, despite all the reasons above, the biggest problem is that nobody reads my blog 😭, which is what really keeps me lazy (though keeping it around for my own reference is also fine).

struct page

The page structure has come up in many earlier posts, but I have never explained it in detail, so let me do that here. In the article 向pipe_buffer说yes! we used an off-by-null to make two pipe_buffer->page pointers point to the same page structure. In the Linux kernel, struct page represents one physical page frame, and every physical page frame has exactly one corresponding page structure; it is precisely this one-to-one correspondence that lets the later pipe_buffer take over the targeted physical page frame.

struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
* WARNING: bit 0 of the first word is used for PageTail(). That
* means the other users of this union MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
union {
struct list_head lru;

/* Or, for the Unevictable "LRU list" slot */
struct {
/* Always even, to negate PageTail */
void *__filler;
/* Count page's or folio's mlocks */
unsigned int mlock_count;
};

/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
union {
pgoff_t index; /* Our offset within mapping. */
unsigned long share; /* share count for fsdax */
};
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if PageSwapCache.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
};
struct { /* page_pool used by netstack */
/**
* @pp_magic: magic value to avoid recycling non
* page_pool allocated pages.
*/
unsigned long pp_magic;
struct page_pool *pp;
unsigned long _pp_mapping_pad;
unsigned long dma_addr;
union {
/**
* dma_addr_upper: might require a 64-bit
* value on 32-bit architectures.
*/
unsigned long dma_addr_upper;
/**
* For frag page support, not supported in
* 32-bit architectures with 64-bit DMA.
*/
atomic_long_t pp_frag_count;
};
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
* mapped so the next 3 words hold the mapping, index,
* and private fields from the source anonymous or
* page cache page while the page is migrated to device
* private memory.
* ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
* use the mapping, index, and private fields when
* pmem backed DAX files are mapped.
*/
};

/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
};

union { /* This union is 4 bytes in size. */
/*
* If the page can be mapped to userspace, encodes the number
* of times this page is referenced by a page table.
*/
atomic_t _mapcount;

/*
* If the page is neither PageSlab nor mappable to userspace,
* the value stored here may help determine what this page
* is used for. See page-flags.h for a list of page types
* which are currently stored here.
*/
unsigned int page_type;
};

/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;

#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif

/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMSAN
/*
* KMSAN metadata for this page:
* - shadow page: every bit indicates whether the corresponding
* bit of the original page is initialized (0) or not (1);
* - origin page: every 4 bytes contain an id of the stack trace
* where the uninitialized value was created.
*/
struct page *kmsan_shadow;
struct page *kmsan_origin;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
} _struct_page_alignment;

The definition of struct page in the kernel is shown above. I will not explain every member here, only the more important ones and those needed later.

flags: flag bits

The meaning of this member is obvious: it describes what state the page is in.

enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
PG_writeback, /* Page is under writeback */
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_head, /* Must be in bit 6 */
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_active,
PG_workingset,
PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
PG_arch_2,
PG_arch_3,
#endif
__NR_PAGEFLAGS,

PG_readahead = PG_reclaim,

/*
* Depending on the way an anonymous folio can be mapped into a page
* table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
* THP), PG_anon_exclusive may be set only for the head page or for
* tail pages of an anonymous folio. For now, we only expect it to be
* set on tail pages for PTE-mapped THP.
*/
PG_anon_exclusive = PG_mappedtodisk,

/* Filesystems */
PG_checked = PG_owner_priv_1,

/* SwapBacked */
PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */

/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
*/
PG_fscache = PG_private_2, /* page backed by cache */

/* XEN */
/* Pinned in Xen as a read-only pagetable page. */
PG_pinned = PG_owner_priv_1,
/* Pinned as part of domain save (see xen_mm_pin_all()). */
PG_savepinned = PG_dirty,
/* Has a grant mapping of another (foreign) domain's page. */
PG_foreign = PG_owner_priv_1,
/* Remapped by swiotlb-xen. */
PG_xen_remapped = PG_owner_priv_1,

/* non-lru isolated movable page */
PG_isolated = PG_reclaim,

/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,

#ifdef CONFIG_MEMORY_HOTPLUG
/* For self-hosted memmap pages */
PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif

/*
* Flags only valid for compound pages. Stored in first tail page's
* flags word. Cannot use the first 8 flags or any flag marked as
* PF_ANY.
*/

/* At least one page in this folio has the hwpoison flag set */
PG_has_hwpoisoned = PG_error,
PG_hugetlb = PG_active,
PG_large_rmappable = PG_workingset, /* anon or file-backed */
};

The enum above maps to the different states a page can be in.

PG_locked: the page is locked, i.e. it is currently in use

PG_referenced: the page has just been accessed; together with PG_reclaim this flag is used for page reclaim of anonymous and file-backed page caches

PG_uptodate: the page is up to date; once a read into the page completes it becomes up-to-date, unless a disk I/O error occurred

PG_dirty: the page is dirty, i.e. its contents have been modified and should be written back to disk soon

PG_lru: the page is on an LRU list

PG_active: the page is on the active LRU list

PG_workingset: the page belongs to some process's working set (the amount of memory a process actually uses at the same time; a process may have allocated 114514 MB but only touch 1919 MB of it at any given moment — that is its working set)

PG_waiters: there are processes waiting on this page

PG_error: an error occurred during I/O on this page

PG_slab: the page is used by the slab allocator

PG_owner_priv_1: the page is used by its owner; for a pagecache page this may mean it is used by the filesystem

PG_arch_1: architecture-specific flag

PG_reserved: the page is reserved and cannot be swapped out (the kernel swaps inactive pages out to disk)

PG_private: the page has private data (the private field)

PG_writeback: the page is being written back to disk

PG_head: the kernel sometimes groups several pages into a compound page; this flag marks the first page of a compound page

PG_mappedtodisk: the page is mapped to blocks on disk

PG_reclaim: the page can be reclaimed

PG_swapbacked: the page is backed by swap/RAM

PG_unevictable: the page cannot be reclaimed (it is locked) and appears on the LRU_UNEVICTABLE list

PG_mlocked: the page is locked by its VMA (usually via the mlock system call)

PG_uncached: the page is mapped as uncached

PG_hwpoison: hardware-poison related flag

PG_arch_2: architecture-specific flag on 64-bit

The flags field also does double duty: besides the flag bits it packs in other information.

/*
* page->flags layout:
*
* There are five possibilities for how page->flags get laid out. The first
* pair is for the normal case without sparsemem. The second pair is for
* sparsemem when there is plenty of space for node and section information.
* The last is when there is insufficient space in page->flags and a separate
* lookup is necessary.
*
* No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
* " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS |
* classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
* " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
* classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
*/

To save memory, the flags field shares its bits with other information. The exact layout depends on the memory model the kernel was configured with; the comment above lists five layouts, which really boil down to three main kinds.

Layout 1: no sparsemem

The low bits hold the page flags, the high bits encode the zone and the node id the page belongs to (0 on non-NUMA systems), and the bits in between are reserved.

Layout 2: classic sparsemem

Compared with the first layout there is an additional SECTION field identifying the mem_section the page belongs to.

Layout 3: classic sparsemem without a node field

The same as the second layout but with the NODE field dropped. This layout is mainly used on non-NUMA systems, where the node information is unnecessary.
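As a rough illustration of this packing, the sketch below mimics how the kernel pulls the section, node and zone numbers out of page->flags with shifts and masks. The real shift and mask values (SECTIONS_PGSHIFT and friends) are computed from the kernel configuration; the numbers used here are made up purely for demonstration:

#include <stdio.h>

/* Hypothetical layout: | SECTION:16 | NODE:6 | ZONE:2 | ... | FLAGS | */
#define DEMO_ZONE_SHIFT     40
#define DEMO_ZONE_MASK      0x3UL
#define DEMO_NODE_SHIFT     42
#define DEMO_NODE_MASK      0x3fUL
#define DEMO_SECTION_SHIFT  48
#define DEMO_SECTION_MASK   0xffffUL

static unsigned long demo_page_zonenum(unsigned long flags)
{
	return (flags >> DEMO_ZONE_SHIFT) & DEMO_ZONE_MASK;
}

static unsigned long demo_page_to_nid(unsigned long flags)
{
	return (flags >> DEMO_NODE_SHIFT) & DEMO_NODE_MASK;
}

static unsigned long demo_page_to_section(unsigned long flags)
{
	return (flags >> DEMO_SECTION_SHIFT) & DEMO_SECTION_MASK;
}

int main(void)
{
	/* section 3, node 1, zone 2, plus a couple of low flag bits */
	unsigned long flags = (3UL << DEMO_SECTION_SHIFT) |
			      (1UL << DEMO_NODE_SHIFT) |
			      (2UL << DEMO_ZONE_SHIFT) | 0x5;

	printf("zone=%lu node=%lu section=%lu\n",
	       demo_page_zonenum(flags),
	       demo_page_to_nid(flags),
	       demo_page_to_section(flags));
	return 0;
}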

lru: list node

Everyone should be familiar with LRU from the page-replacement algorithms covered in operating systems class.

Through the lru field, page structures are linked together into lists.

slab: slab-related fields

In older kernels the page structure had a dedicated anonymous struct holding the slab-related members:

struct {	/* slab, slob and slub */
union {
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
unsigned long counters; /* SLUB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};

I will not go into slab again here; see the previous article Pspray: Timing Side-Channel based Linux Kernel Heap Exploitation Technique.

_mapcount: mapping count

This records how many page-table entries map the page. Since every process has its own page tables, you can think of it as the number of processes sharing the page; its initial value is -1. Because this is a union, if the page is not mapped into userspace the field is used as page_type instead.

_refcount: reference count

Reference counting should be familiar. This field is the page's reference counter inside the kernel: it starts at 0 when the page is free, is incremented when the page is allocated or referenced by something else, and a value of 0 means the page is free or about to be freed, while a value greater than 0 means it is still in use and will not be released for now.

The kernel provides two functions, get_page() and put_page(), to increment and decrement the reference count:

static inline void put_page(struct page *page)
{
struct folio *folio = page_folio(page);

/*
* For some devmap managed pages we need to catch refcount transition
* from 2 to 1:
*/
if (put_devmap_managed_page(&folio->page))
return;
folio_put(folio);
}

put_page() ends up in folio_put(), which checks whether the reference count has dropped to zero and, if so, frees the page.
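A typical (hypothetical) usage pattern inside kernel code looks like the sketch below: take a reference while you hold on to the page and drop it when you are done; the page only goes back to the allocator once the last reference is gone. This is only an illustration of the get_page()/put_page() pairing, not code from any real driver:

/* Minimal sketch, assuming kernel context; error handling is omitted. */
#include <linux/mm.h>
#include <linux/gfp.h>

static struct page *demo_hold_page(void)
{
	struct page *page = alloc_page(GFP_KERNEL);   /* _refcount == 1 */

	if (!page)
		return NULL;

	get_page(page);   /* keep an extra reference, _refcount == 2 */
	return page;
}

static void demo_release_page(struct page *page)
{
	put_page(page);   /* drop our extra reference */
	put_page(page);   /* drop the allocation reference; the page is freed */
}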

virtual: virtual address

This field holds the kernel virtual address corresponding to the physical page frame.

Each struct page corresponds to one physical page frame, so the virtual field is essentially the reverse of that mapping.
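On most 64-bit configurations this field does not even exist (WANT_PAGE_VIRTUAL is unset), because all of RAM sits in the kernel's direct mapping and the virtual address can simply be computed. The sketch below shows that arithmetic, assuming the x86-64 direct-map base of 0xffff888000000000 (no KASLR) and 4 KiB pages; the helper names are made up:

#include <stdio.h>

#define PAGE_SHIFT       12
#define DEMO_PAGE_OFFSET 0xffff888000000000UL   /* x86-64 direct-map base, no KASLR */

/* physical address -> direct-mapped kernel virtual address */
static unsigned long demo_phys_to_virt(unsigned long phys)
{
	return phys + DEMO_PAGE_OFFSET;
}

/* page frame number -> direct-mapped kernel virtual address */
static unsigned long demo_pfn_to_virt(unsigned long pfn)
{
	return demo_phys_to_virt(pfn << PAGE_SHIFT);
}

int main(void)
{
	unsigned long pfn = 0x1337;

	printf("pfn 0x%lx maps to kernel VA 0x%lx\n", pfn, demo_pfn_to_virt(pfn));
	return 0;
}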

How struct page is stored under different memory models

Linux provides the three memory models below. The model is fixed at compile time, and Sparse Memory is by far the most common today.

Flat Memory

The flat memory model. Physical memory is contiguous, and a global variable mem_map points to one large array of struct page that directly covers all of physical memory.

Discontiguous Memory

The discontiguous memory model. Every contiguous chunk of physical memory is described by a pglist_data structure whose node_mem_map member is a struct page pointer to an array of page structures covering that chunk. A global array node_data holds pointers to every pglist_data; its size is MAX_NUMNODES. This model mainly targets memory layouts with holes.

Sparse Memory

The sparse memory model. Each mem_section structure has a section_mem_map member that refers to a struct page array covering one contiguous stretch of physical memory; in other words, memory is split into sections.

struct mem_section {
/*
* This is, logically, a pointer to an array of struct
* pages. However, it is stored with some other magic.
* (see sparse.c::sparse_init_one_section())
*
* Additionally during early boot we encode node id of
* the location of the section here to guide allocation.
* (see sparse.c::memory_present())
*
* Making it a UL at least makes someone do a cast
* before using it wrong.
*/
unsigned long section_mem_map;

struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
/*
* If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
* section. (see page_ext.h about this.)
*/
struct page_ext *page_ext;
unsigned long pad;
#endif
/*
* WARNING: mem_section must be a power-of-2 in size for the
* calculation and use of SECTION_ROOT_MASK to make sense.
*/
};

There is a global array, also named mem_section (same name as the struct), holding pointers to all mem_section structures and spanning the theoretically supported address space. The physical memory behind a given section does not necessarily exist; if it does not, the corresponding pointer is NULL.

#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
#endif

If CONFIG_SPARSEMEM_EXTREME is not enabled, mem_section is an ordinary two-dimensional array; otherwise it is a pointer to pointers whose backing storage is allocated dynamically.

This model supports memory hotplug.

Note that the section_mem_map member of struct mem_section is declared as unsigned long rather than struct page *: what it actually records is the offset between the page array and the section's starting PFN, i.e. section_mem_map = page_arr_addr - PFN_start.

#elif defined(CONFIG_SPARSEMEM)
/*
* Note: section's mem_map is encoded to reflect its start_pfn.
* section[i].section_mem_map == mem_map's address - start_pfn;
*/
#define __page_to_pfn(pg) \
({ const struct page *__pg = (pg); \
int __sec = page_to_section(__pg); \
(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
})

#define __pfn_to_page(pfn) \
({ unsigned long __pfn = (pfn); \
struct mem_section *__sec = __pfn_to_section(__pfn); \
__section_mem_map_addr(__sec) + __pfn; \
})
#endif /* CONFIG_FLATMEM/SPARSEMEM */

The kernel also provides two macros to convert between a PFN and a page; let's look at them in detail.

#define __page_to_pfn(pg)					\
({ const struct page *__pg = (pg); \
int __sec = page_to_section(__pg); \
(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
})

First, the page-to-PFN direction.

static inline unsigned long page_to_section(const struct page *page)
{
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}

The macro first calls page_to_section(), which extracts the number of the section the page belongs to from page->flags.

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
unsigned long root = SECTION_NR_TO_ROOT(nr);

if (unlikely(root >= NR_SECTION_ROOTS))
return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
if (!mem_section || !mem_section[root])
return NULL;
#endif
return &mem_section[root][nr & SECTION_ROOT_MASK];
}

It then calls __nr_to_section() to obtain the address of the corresponding mem_section structure.

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT 1
#endif

#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)

Assuming CONFIG_SPARSEMEM_EXTREME is enabled, SECTIONS_PER_ROOT is the number of struct mem_section entries that fit in one page, so SECTION_NR_TO_ROOT yields the index of that page (the "root"), and nr & SECTION_ROOT_MASK then gives the index of the mem_section within that page.
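In other words, the section number is split into a root index and an offset, exactly like indexing a two-level table. A quick sketch of the arithmetic, assuming a 4096-byte page and a hypothetical sizeof(struct mem_section) of 32 bytes:

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define MEM_SECTION_SIZE   32UL                          /* assumed sizeof(struct mem_section) */
#define SECTIONS_PER_ROOT  (PAGE_SIZE / MEM_SECTION_SIZE)
#define SECTION_ROOT_MASK  (SECTIONS_PER_ROOT - 1)

int main(void)
{
	unsigned long nr = 300;                          /* some section number */
	unsigned long root   = nr / SECTIONS_PER_ROOT;   /* which page of mem_sections */
	unsigned long offset = nr & SECTION_ROOT_MASK;   /* index within that page */

	printf("section %lu -> mem_section[%lu][%lu]\n", nr, root, offset);
	return 0;
}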

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
unsigned long map = section->section_mem_map;
map &= SECTION_MAP_MASK;
return (struct page *)map;
}

Finally, __section_mem_map_addr() fetches the section_mem_map member from the mem_section, and subtracting it from the page structure's address yields the PFN. Given the relationship above, the whole computation works out to (page_addr - page_arr_addr) + PFN_start = PFN.

Next, let's look at __pfn_to_page.

#define __pfn_to_page(pfn)				\
({ unsigned long __pfn = (pfn); \
struct mem_section *__sec = __pfn_to_section(__pfn); \
__section_mem_map_addr(__sec) + __pfn; \
})
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
return pfn >> PFN_SECTION_SHIFT;
}

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
return __nr_to_section(pfn_to_section_nr(pfn));
}

The macro first calls __pfn_to_section() to find the mem_section a PFN belongs to, which in turn calls pfn_to_section_nr() to compute the section index. pfn_to_section_nr() uses PFN_SECTION_SHIFT, defined as SECTION_SIZE_BITS minus PAGE_SHIFT: SECTION_SIZE_BITS is the number of address bits covered by one section and PAGE_SHIFT the number of bits covered by one page, so PFN_SECTION_SHIFT is the number of bits needed to index a page within a section (i.e. the log2 of the pages per section).

Shifting the page frame number right by PFN_SECTION_SHIFT therefore yields the number of the section the page lives in.

__nr_to_section() then returns the corresponding mem_section, __section_mem_map_addr() extracts its section_mem_map member, and adding the page frame number gives the page. Given the relationship above, the computation works out to (PFN - PFN_start) + page_arr_addr = page_addr.

Finally, the vmemmap variant built on top of Sparse Memory is the memory model Linux uses most often today.

With vmemmap enabled, the page arrays of all mem_sections are presented as one virtually contiguous array called vmemmap, so converting between struct page * and a PFN becomes simple pointer arithmetic on that array.

#elif defined(CONFIG_SPARSEMEM_VMEMMAP)

/* memmap is virtually contiguous. */
#define __pfn_to_page(pfn) (vmemmap + (pfn))
#define __page_to_pfn(page) (unsigned long)((page) - vmemmap)
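To make the arithmetic concrete, the sketch below computes the struct page address for a PFN and back again, assuming the default x86-64 vmemmap base of 0xffffea0000000000 (no KASLR) and a 64-byte struct page; both values depend on the configuration:

#include <stdio.h>

#define DEMO_VMEMMAP_BASE  0xffffea0000000000UL   /* x86-64 default, no KASLR */
#define STRUCT_PAGE_SIZE   64UL                   /* typical sizeof(struct page) */

static unsigned long demo_pfn_to_page(unsigned long pfn)
{
	return DEMO_VMEMMAP_BASE + pfn * STRUCT_PAGE_SIZE;
}

static unsigned long demo_page_to_pfn(unsigned long page_addr)
{
	return (page_addr - DEMO_VMEMMAP_BASE) / STRUCT_PAGE_SIZE;
}

int main(void)
{
	unsigned long pfn = 0x1337;
	unsigned long page_addr = demo_pfn_to_page(pfn);

	printf("pfn 0x%lx -> struct page at 0x%lx -> pfn 0x%lx\n",
	       pfn, page_addr, demo_page_to_pfn(page_addr));
	return 0;
}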

struct zone

In Linux, the memory of a node is divided into regions with different purposes, called zones, represented by struct zone:

struct zone {
/* Read-mostly fields */

/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;

unsigned long nr_reserved_highatomic;

/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
struct per_cpu_pages __percpu *per_cpu_pageset;
struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
* the high and batch values are copied to individual pagesets for
* faster access
*/
int pageset_high;
int pageset_batch;

#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;

/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* present_early_pages is present pages existing within the zone
* located on memory available since early boot, excluding hotplugged
* memory.
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* cma pages is present pages that are assigned for CMA use
* (MIGRATE_CMA).
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/done(). Any reader who can't tolerant drift of
* present_pages should use get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
unsigned long present_early_pages;
#endif
#ifdef CONFIG_CMA
unsigned long cma_pages;
#endif

const char *name;

#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif

int initialized;

/* Write-intensive fields used from the page allocator */
CACHELINE_PADDING(_pad1_);

/* free areas of different sizes */
struct free_area free_area[MAX_ORDER + 1];

#ifdef CONFIG_UNACCEPTED_MEMORY
/* Pages to be accepted. All pages on the list are MAX_ORDER */
struct list_head unaccepted_pages;
#endif

/* zone flags, see below */
unsigned long flags;

/* Primarily protects free_area */
spinlock_t lock;

/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);

/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
* compact_order_failed is the minimum compaction failed order.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif

bool contiguous;

CACHELINE_PADDING(_pad3_);
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

_watermark: watermarks

Every zone has three watermarks — WMARK_MIN, WMARK_LOW and WMARK_HIGH — stored in the _watermark array. When allocating memory, the allocator (e.g. the buddy system) uses the watermark the zone's free memory currently sits at to judge the memory situation. If the allocator finds that free memory is below "low" but above "min", memory is under some pressure, so after the allocation completes kswapd is woken to reclaim memory; the allocation triggers reclaim but is not blocked by it, the two run asynchronously (older kswapd implementations were instead triggered periodically). "low" can be thought of as a warning level, while "high" is a safe level.
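A simplified sketch of that decision logic (not the kernel's actual zone_watermark_ok(), which also accounts for lowmem reserves and per-migratetype reserves):

#include <stdio.h>
#include <stdbool.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

struct demo_zone {
	unsigned long watermark[NR_WMARK];
	unsigned long free_pages;
};

/* Decide what happens for an allocation of 'nr' pages from this zone. */
static bool demo_alloc_check(struct demo_zone *z, unsigned long nr, bool *wake_kswapd)
{
	unsigned long left = (z->free_pages > nr) ? z->free_pages - nr : 0;

	*wake_kswapd = false;
	if (left <= z->watermark[WMARK_MIN])
		return false;                /* too low: fall back to another zone / reclaim first */
	if (left <= z->watermark[WMARK_LOW])
		*wake_kswapd = true;         /* allocation succeeds, but kswapd gets woken */
	return true;
}

int main(void)
{
	struct demo_zone z = {
		.watermark  = { 1000, 2000, 3000 },
		.free_pages = 2100,
	};
	bool wake;
	bool ok = demo_alloc_check(&z, 200, &wake);

	printf("allocation %s, wake kswapd: %s\n", ok ? "ok" : "fails", wake ? "yes" : "no");
	return 0;
}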

lowmem_reserve: memory the zone keeps for itself

When an allocation cannot be satisfied from the current zone, the allocator falls back to a lower zone. This creates a problem: allocation requests falling back from higher zones could exhaust a lower zone, yet the memory allocated that way is not necessarily freeable and may never be released, so a lower zone could run dry while the higher zones still hold plenty of memory.

To prevent that, the lowmem_reserve field declares an amount of memory reserved for the zone itself that other zones' fallback allocations are not allowed to touch.

node: the owning node under NUMA

This field exists only when CONFIG_NUMA is enabled and identifies the node the zone belongs to.

zone_pgdat: the pglist_data node the zone belongs to

This field points to the pglist_data node that owns the zone.

per_cpu_pageset: a private page pool for each CPU

As everyone knows, with multiple CPUs comes contention: if several CPUs operate on the same zone, constantly taking and releasing the zone lock is expensive. The zone therefore has a per_cpu_pageset member — implemented as a percpu variable — that gives every CPU its own small page pool. The buddy system initially hands pages out to each CPU's pool, and a CPU allocates from its own pool first.

struct per_cpu_pages {
spinlock_t lock; /* Protects lists field */
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
short free_factor; /* batch scaling factor during free */
#ifdef CONFIG_NUMA
short expire; /* When 0, remote pagesets are drained */
#endif

/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

This structure lives in each CPU's private .data..percpu section.

vm_stat: statistics

This array collects statistics, indexed by the zone_stat_item enum so that different kinds of counters can be tracked (for example NR_FREE_PAGES is the number of free pages in the zone):

enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
NR_ZONE_INACTIVE_FILE,
NR_ZONE_ACTIVE_FILE,
NR_ZONE_UNEVICTABLE,
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
/* Second 128 byte cacheline */
NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
NR_UNACCEPTED,
#endif
NR_VM_ZONE_STAT_ITEMS };

free_area: pages managed by the buddy system, grouped by order

This field stores the pages the buddy system manages per order; it is an array of free_area structures, defined as follows:

struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
};

The pages stored in a free_area are linked into doubly linked lists through the corresponding fields in struct page.

A free_area holds not one list but several, separated by migrate type, because of the page migration mechanism.

Other members

zone_start_pfn: the zone's starting PFN

This field records the physical page frame number at which the zone starts.

spanned_pages: total pages spanned by the zone, including holes

This field records the total number of pages covered by the zone's address range, holes included.

present_pages: physical page frames actually present in the zone

This field records the number of physical page frames that actually exist in the zone.

managed_pages: pages in the zone managed by the buddy system

This field records how many of the zone's pages are managed by the buddy system.

flags: flag bits

The zone's flags, describing the state it is in.

Page migration

Page migration mainly addresses fragmentation of kernel memory: after running for a long time, free pages may be scattered all over, so the kernel may be unable to find a large enough run of contiguous memory, and pages have to be migrated to new locations — something you will have seen in an operating systems course.

Not every page can be moved at will, though, so the buddy system additionally classifies pages by migrate type.

enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
* ZONE_MOVABLE works. Only movable pages can be allocated
* from MIGRATE_CMA pageblocks and page allocator never
* implicitly change migration type of MIGRATE_CMA pageblock.
*
* The way to use it is to change migratetype of a range of
* pageblocks to MIGRATE_CMA which can be done by
* __free_pageblock_cma() function.
*/
MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
MIGRATE_TYPES
};

The migrate types are defined by the enum above.

  • MIGRATE_UNMOVABLE: these pages have a fixed location in memory and cannot be moved
  • MIGRATE_MOVABLE: these pages can be moved freely, e.g. userspace pages — we only need to copy the data and update the page-table mappings
  • MIGRATE_RECLAIMABLE: these pages cannot be moved directly but can be dropped, e.g. pages mapped from a file
  • MIGRATE_PCPTYPES: the number of migrate types kept on the per-CPU pageset (per_cpu_pageset) lists
  • MIGRATE_ISOLATE: pages cannot be allocated from this list; it is used while a range of pages is isolated, e.g. when migrating pages across NUMA nodes toward the node whose CPUs use them most
  • MIGRATE_TYPES: the number of migrate types; there is no list for it

Zone types

In the Linux kernel, memory regions are divided into different zones according to their purpose:

enum zone_type {
/*
* ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
* to DMA to all of the addressable memory (ZONE_NORMAL).
* On architectures where this area covers the whole 32 bit address
* space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
* DMA addressing constraints. This distinction is important as a 32bit
* DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
* platforms may need both zones as they support peripherals with
* different DMA addressing limitations.
*/
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/*
* ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
* movable pages with few exceptional cases described below. Main use
* cases for ZONE_MOVABLE are to make memory offlining/unplug more
* likely to succeed, and to locally limit unmovable allocations - e.g.,
* to increase the number of THP/huge pages. Notable special cases are:
*
* 1. Pinned pages: (long-term) pinning of movable pages might
* essentially turn such pages unmovable. Therefore, we do not allow
* pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
* faulted, they come from the right zone right away. However, it is
* still possible that address space already has pages in
* ZONE_MOVABLE at the time when pages are pinned (i.e. user has
* touches that memory before pinning). In such case we migrate them
* to a different zone. When migration fails - pinning fails.
* 2. memblock allocations: kernelcore/movablecore setups might create
* situations where ZONE_MOVABLE contains unmovable allocations
* after boot. Memory offlining and allocations fail early.
* 3. Memory holes: kernelcore/movablecore setups might create very rare
* situations where ZONE_MOVABLE contains memory holes after boot,
* for example, if we have sections that are only partially
* populated. Memory offlining and allocations fail early.
* 4. PG_hwpoison pages: while poisoned pages can be skipped during
* memory offlining, such pages cannot be allocated.
* 5. Unmovable PG_offline pages: in paravirtualized environments,
* hotplugged memory blocks might only partially be managed by the
* buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
* parts not manged by the buddy are unmovable PG_offline pages. In
* some cases (virtio-mem), such pages can be skipped during
* memory offlining, however, cannot be moved/allocated. These
* techniques might use alloc_contig_range() to hide previously
* exposed pages from the buddy again (e.g., to implement some sort
* of memory unplug in virtio-mem).
* 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
* situations where ZERO_PAGE(0) which is allocated differently
* on different platforms may end up in a movable zone. ZERO_PAGE(0)
* cannot be migrated.
* 7. Memory-hotplug: when using memmap_on_memory and onlining the
* memory to the MOVABLE zone, the vmemmap pages are also placed in
* such zone. Such pages cannot be really moved around as they are
* self-stored in the range, but they are treated as movable when
* the range they describe is about to be offlined.
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
* have to expect that migrating pages in ZONE_MOVABLE can fail (even
* if has_unmovable_pages() states that there are no unmovable pages,
* there can be false negatives).
*/
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES

};
  • ZONE_DMA: the direct memory access zone, used because some (usually older) devices cannot DMA into all of addressable memory
  • ZONE_DMA32: the DMA32 zone on 64-bit systems; it covers memory below 4 GB for devices that can only DMA to 32-bit addresses (devices limited to the first 16 MB still use ZONE_DMA)
  • ZONE_NORMAL: the normal memory zone, which can be mapped directly into the kernel's virtual address space
  • ZONE_HIGHMEM: the high memory zone, a 32-bit concept; ZONE_DMA and ZONE_NORMAL are accordingly called "low memory"
  • ZONE_MOVABLE: the "movable" zone, a pseudo zone meant to fight memory fragmentation
  • ZONE_DEVICE: the device zone, used for device-owned memory that is hot-added into the kernel's view; each such region is described by a zone structure

Roughly summarised, first for x86-32:

Type          Start address   End address
ZONE_DMA      0 MB            16 MB
ZONE_NORMAL   16 MB           896 MB
ZONE_HIGHMEM  896 MB          end of physical memory

And for x86-64:

Type          Start address   End address
ZONE_DMA      0 MB            16 MB
ZONE_DMA32    16 MB           4 GB
ZONE_NORMAL   4 GB            end of physical memory

struct pglist_data

One level above the zone is the node. Linux uses the memory controller as the basis for dividing nodes: a UMA system has a single node, while a NUMA system usually has several. Memory behind the same controller as a CPU is that CPU's local memory, and the different processors are connected by an interconnect.

A node is described by a pglist_data structure:

typedef struct pglist_data {
/*
* node_zones contains just the zones for THIS node. Not all of the
* zones may be populated, but it is the full list. It is referenced by
* this node's node_zonelists as well as other node's node_zonelists.
*/
struct zone node_zones[MAX_NR_ZONES];

/*
* node_zonelists contains references to all zones in all nodes.
* Generally the first zones will be references to this node's
* node_zones.
*/
struct zonelist node_zonelists[MAX_ZONELISTS];

int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
* Also synchronizes pgdat->first_deferred_pfn during deferred page
* init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
* or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;

/* workqueues for throttling reclaim for different reasons. */
wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
unsigned long nr_reclaim_start; /* nr pages written while throttled
* when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
struct mutex kswapd_lock;
#endif
struct task_struct *kswapd; /* Protected by kswapd_lock */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;

int kswapd_failures; /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
bool proactive_compact_trigger;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;

#ifdef CONFIG_NUMA
/*
* node reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */

/* Write-intensive fields used by page reclaim */
CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_NUMA_BALANCING
/* start time in ms of current promote rate limit period */
unsigned int nbp_rl_start;
/* number of promote candidate pages at start time of current rate limit period */
unsigned long nbp_rl_nr_cand;
/* promote threshold in ms */
unsigned int nbp_threshold;
/* start time in ms of current promote threshold adjustment period */
unsigned int nbp_th_start;
/*
* number of promote candidate pages at start time of current promote
* threshold adjustment period
*/
unsigned long nbp_th_nr_cand;
#endif
/* Fields commonly accessed by the page reclaim scanner */

/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
*
* Use mem_cgroup_lruvec() to look up lruvecs.
*/
struct lruvec __lruvec;

unsigned long flags;

#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif

CACHELINE_PADDING(_pad2_);

/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
#ifdef CONFIG_NUMA
struct memory_tier __rcu *memtier;
#endif
#ifdef CONFIG_MEMORY_FAILURE
struct memory_failure_stats mf_stats;
#endif
} pg_data_t;

Member descriptions

  • node_zones: the node's zones. The most important field of the node, a zone array recording every zone on this node; the number of valid zones is given by the node's nr_zones field.
  • node_zonelists: the fallback search order for allocations. This field determines the order in which fallback zones are searched when a normal allocation on this node fails; the zones it contains may belong to other nodes.

The related structures are defined as follows:

struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};

struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
  • node_start_pfn: the node's starting page frame number, i.e. the PFN where this node's physical memory begins.
  • node_present_pages: the total number of physical pages on the node.
  • node_spanned_pages: the total length of the node's memory in page frames, holes included.
  • node_id: the node's number in the system, starting from 0.

How nodes are stored

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

The kernel defines an array of pglist_data pointers named node_data, which holds all the nodes in the system.

➜  ~ numactl --hardware
available: 1 nodes (0)
node 0 cpus: 0 1
node 0 size: 3904 MB
node 0 free: 313 MB
node distances:
node 0
0: 10
➜ ~

On Linux you can use the command above to see how many nodes exist.

Node states

nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);

The kernel has the global array above to record the state of the node with each number.

typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

Here nodemask_t is a bitmap type.

enum node_states {
N_POSSIBLE, /* The node could become online at some point */
N_ONLINE, /* The node is online */
N_NORMAL_MEMORY, /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
N_HIGH_MEMORY, /* The node has regular or high memory */
#else
N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
N_MEMORY, /* The node has memory(regular, high, movable) */
N_CPU, /* The node has one or more cpus */
N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */
NR_NODE_STATES
};

The possible states are defined by the node_states enum.

How the buddy system organises memory

The free_area member of struct zone was briefly mentioned above; it stores the pages that the buddy system manages, grouped by order.

struct zone {
// ... ...
struct free_area free_area[MAX_ORDER + 1];
// ... ...
} ____cacheline_internodealigned_in_smp;

Here MAX_ORDER is 10, so the array has 11 entries.

The buddy system manages free memory by the size of contiguous free runs. The order is that size expressed not as a page count but as an exponent: the entry at index order holds blocks of 2^order contiguous pages.
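So with 4 KiB pages, order 0 is a single 4 KiB page and order 10 is a 4 MiB block. A tiny sketch of the size arithmetic:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define MAX_ORDER 10

int main(void)
{
	/* size of a buddy block at each order: 2^order pages */
	for (unsigned int order = 0; order <= MAX_ORDER; order++)
		printf("order %2u: %4lu pages, %8lu KiB\n",
		       order, 1UL << order, (PAGE_SIZE << order) / 1024);
	return 0;
}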

The pages in a free_area are linked into doubly linked lists through fields of struct page, as described in detail earlier. Let's briefly look at the free_area structure itself.

struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
};

The nr_free member obviously records the number of free blocks currently in this free_area; note that for every free_area other than free_area[0] the unit is a block, not a single page frame.

The free_list member is an array of list_head lists that link page structures into doubly linked lists through their lru field.

struct page {
	unsigned long flags;
	union {
		struct {
			union {
				struct list_head lru;
				// ... ...
			};
			// ... ...
		};
		// ... ...
	};
	// ... ...
} _struct_page_alignment;

The lru field of struct page has type struct list_head, the kernel's generic doubly linked list. Both free_list and the LRU lists use this same field to link page structures together, which means a page can never be on an LRU list and in the buddy system at the same time.

You can also clearly see that the lists are split by migrate type, which was covered earlier, so I will not repeat it here.

Page allocation in the buddy system

GFP (get free page) flags

In kernel memory allocation we constantly run into the gfp_t type, which carries the allocation flags.

  • Zone modifiers: describe which zones the memory may come from
    __GFP_DMA: allocate from ZONE_DMA
    __GFP_HIGHMEM: allocate from ZONE_HIGHMEM
    __GFP_DMA32: allocate from ZONE_DMA32
    __GFP_MOVABLE: the page may be migrated or reclaimed during memory compaction
  • Mobility and placement modifiers: describe the migration properties of the allocated pages
    __GFP_RECLAIMABLE: the allocated pages are reclaimable
    __GFP_WRITE: the pages will be written to (become dirty)
    __GFP_HARDWALL: enforce the cpuset memory allocation policy
    __GFP_THISNODE: allocate only on the specified node
    __GFP_ACCOUNT: the allocation is accounted by kmemcg
  • Watermark modifiers: flags related to the watermarks
    __GFP_ATOMIC: the caller cannot sleep or reclaim (atomic context) and the request is high priority
    __GFP_HIGH: high-priority allocation that may dip into the reserves below the min watermark
    __GFP_MEMALLOC: allow access to all of memory, including the emergency reserves
    __GFP_NOMEMALLOC: forbid access to the emergency reserves below the min watermark
  • Reclaim modifiers: flags related to page reclaim
    __GFP_IO: physical I/O may be started
    __GFP_FS: calls into the filesystem layer are allowed; clearing it keeps the allocator from recursing into a filesystem that may already hold locks, avoiding deadlock
    __GFP_DIRECT_RECLAIM: direct reclaim may be used during the allocation
    __GFP_KSWAPD_RECLAIM: wake the kswapd thread for asynchronous reclaim when the low watermark is reached
    __GFP_RECLAIM: shorthand allowing both direct reclaim and kswapd reclaim
    __GFP_RETRY_MAYFAIL: the allocation may still fail, but some otherwise unnecessary memory is reclaimed along the way, which benefits the system as a whole
    __GFP_NOFAIL: retry indefinitely until the allocation succeeds
    __GFP_NORETRY: if memory still cannot be allocated after direct reclaim or compaction, do not retry, just return NULL
  • Behaviour modifiers: flags related to allocator behaviour
    __GFP_NOWARN: suppress warnings during the allocation
    __GFP_COMP: the allocated pages are combined into a compound page
    __GFP_ZERO: return a page filled with zeroes
  • Composite flags: because the modifiers above are so numerous, Linux defines composite flags for developers to use
    GFP_ATOMIC = __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM: the allocation cannot sleep, is high priority, and may use the system reserves
    GFP_KERNEL = __GFP_RECLAIM | __GFP_IO | __GFP_FS: the allocation may block (sleep)
    GFP_KERNEL_ACCOUNT = GFP_KERNEL | __GFP_ACCOUNT: same as GFP_KERNEL, but the allocation is accounted by kmemcg
    GFP_NOWAIT = __GFP_KSWAPD_RECLAIM: the allocation must not stall for direct reclaim
    GFP_NOIO = __GFP_RECLAIM: no I/O of any kind may be started
    GFP_NOFS = __GFP_RECLAIM | __GFP_IO: no filesystem access is allowed
    GFP_USER = __GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL: allocation on behalf of a userspace process
    GFP_DMA = __GFP_DMA: allocate from ZONE_DMA
    GFP_DMA32 = __GFP_DMA32: allocate from ZONE_DMA32
    GFP_HIGHUSER = GFP_USER | __GFP_HIGHMEM: userspace allocation that may use ZONE_HIGHMEM; the pages are not movable
    GFP_HIGHUSER_MOVABLE = GFP_HIGHUSER | __GFP_MOVABLE: like GFP_HIGHUSER, but the pages may be migrated
    GFP_TRANSHUGE_LIGHT = (GFP_HIGHUSER_MOVABLE | __GFP_COMP | __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM: allocation for transparent huge pages; "light" means no compaction or reclaim is performed
    GFP_TRANSHUGE = GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM: like GFP_TRANSHUGE_LIGHT; usually used by khugepaged
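
To connect these flags to the page allocator itself, here is a hedged sketch of how a kernel module might request and release pages directly from the buddy system. alloc_pages()/__free_pages() and the flags are real kernel APIs; the surrounding module is only an illustration:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages;

static int __init demo_init(void)
{
	/* ask the buddy system for an order-2 block (4 contiguous pages),
	 * zeroed, from the normal kernel zones */
	demo_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);
	if (!demo_pages)
		return -ENOMEM;

	pr_info("got pfn %lx, kernel VA %px\n",
		page_to_pfn(demo_pages), page_address(demo_pages));
	return 0;
}

static void __exit demo_exit(void)
{
	/* give the order-2 block back to the buddy system */
	__free_pages(demo_pages, 2);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");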

The alloc_context structure

struct alloc_context {
struct zonelist *zonelist;
nodemask_t *nodemask;
struct zoneref *preferred_zoneref;
int migratetype;

/*
* highest_zoneidx represents highest usable zone index of
* the allocation request. Due to the nature of the zone,
* memory on lower zone than the highest_zoneidx will be
* protected by lowmem_reserve[highest_zoneidx].
*
* highest_zoneidx is also used by reclaim/compaction to limit
* the target zone since higher zone than this index cannot be
* usable for this allocation request.
*/
enum zone_type highest_zoneidx;
bool spread_dirty_pages;
};

This is a very important structure during allocation: it describes the context of a single allocation request.

The member we care about most is zonelist:

struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

This member is the list of zones this allocation will consider; a zonelist is essentially an array of zoneref entries.

struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};

The zoneref structure was mentioned above: it holds a pointer to a zone plus its index.

Next, the preferred_zoneref member of alloc_context is a zoneref describing the zone to try first.

Finally, spread_dirty_pages indicates whether this allocation may produce dirty pages (pages that will need writeback); it is set for allocations of pages that are going to be written (__GFP_WRITE).

The __alloc_pages function

This is the core page-allocation function of the buddy system; every page allocation API is a wrapper around it.

struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };

/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
return NULL;

gfp &= gfp_allowed_mask;
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
* movable zones are not used during allocation.
*/
gfp = current_gfp_context(gfp);
alloc_gfp = gfp;
if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
&alloc_gfp, &alloc_flags))
return NULL;

/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);

/* First allocation attempt */
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;

alloc_gfp = gfp;
ac.spread_dirty_pages = false;

/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
ac.nodemask = nodemask;

page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}

trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);

return page;
}
EXPORT_SYMBOL(__alloc_pages);

The function takes four parameters: gfp, the allocation flags; order, the order of the physical block to allocate; preferred_nid, the id of the preferred node; and nodemask, the mask of candidate nodes.

Earlier articles already covered memory allocation (albeit from the slab), so the general shape of an allocation function should be familiar, and this one is no exception: first check the parameters and do the pre-allocation preparation, then try the fast path and return on success, and finally fall back to the slow path if the fast path fails.

Now let's go through the function in detail. On entry it checks that order does not exceed MAX_ORDER, then calls prepare_alloc_pages() to do the pre-allocation setup.

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_gfp,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfp_migratetype(gfp_mask);

if (cpusets_enabled()) {
*alloc_gfp |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
*/
if (in_task() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}

might_alloc(gfp_mask);

if (should_fail_alloc_page(gfp_mask, order))
return false;

*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);

/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);

return true;
}

First, node_zonelist() fetches a zonelist from the node specified by preferred_nid:

#define NODE_DATA(nid)		(node_data[nid])

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}

It then performs the cpuset checks and flag setup: if we are in process context and no nodemask was supplied, the nodemask is set to cpuset_current_mems_allowed; otherwise the ALLOC_CPUSET flag is set (in interrupt context the current task's cpuset is irrelevant, so any node is acceptable).

Finally, first_zones_zonelist() picks the preferred zone, roughly the first zone in the zonelist (restricted by the nodemask) at or below highest_zoneidx.

Overall this function just does the preparation for the allocation, including initialising the alloc_context, obtaining the zone list, and so on.
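One piece of that preparation is mapping the GFP zone modifiers to the highest usable zone (ac->highest_zoneidx = gfp_zone(gfp_mask)). The real gfp_zone() is a table lookup built from GFP_ZONE_TABLE; the sketch below only captures its effect for the common cases, with made-up flag values:

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE };

#define DEMO_GFP_DMA      0x01u
#define DEMO_GFP_HIGHMEM  0x02u
#define DEMO_GFP_DMA32    0x04u
#define DEMO_GFP_MOVABLE  0x08u

/* Simplified stand-in for gfp_zone(): pick the highest zone the flags allow. */
static enum zone_type demo_gfp_zone(unsigned int gfp)
{
	if (gfp & DEMO_GFP_DMA)
		return ZONE_DMA;
	if (gfp & DEMO_GFP_DMA32)
		return ZONE_DMA32;
	if ((gfp & DEMO_GFP_MOVABLE) && (gfp & DEMO_GFP_HIGHMEM))
		return ZONE_MOVABLE;
	return ZONE_NORMAL;                /* GFP_KERNEL and friends end up here */
}

int main(void)
{
	printf("GFP_KERNEL-like  -> zone %d\n", demo_gfp_zone(0));
	printf("__GFP_DMA32      -> zone %d\n", demo_gfp_zone(DEMO_GFP_DMA32));
	printf("HIGHUSER_MOVABLE -> zone %d\n", demo_gfp_zone(DEMO_GFP_MOVABLE | DEMO_GFP_HIGHMEM));
	return 0;
}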

The get_page_from_freelist function

Following the flow of __alloc_pages, the next stop is get_page_from_freelist(), which is the fast path mentioned above.

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;

retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;

if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat != zone->zone_pgdat) {
last_pgdat = zone->zone_pgdat;
last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
}

if (!last_pgdat_dirty_ok)
continue;
}

if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;

/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}

/*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
* PCP pages in free path to reduce the possibility of
* premature page reclaiming. Detection is done here to
* avoid to do that in hotter free path.
*/
if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
goto check_alloc_wmark;

mark = high_wmark_pages(zone);
if (zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
gfp_mask))
goto try_this_zone;
else
set_bit(ZONE_BELOW_HIGH, &zone->flags);

check_alloc_wmark:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
gfp_mask)) {
int ret;

if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;

if (!node_reclaim_enabled() ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;

ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac->highest_zoneidx, alloc_flags))
goto try_this_zone;

continue;
}
}

try_this_zone:
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);

/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
reserve_highatomic_pageblock(page, zone);

return page;
} else {
if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}

/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}

return NULL;
}

The overall flow is as follows. The for_next_zone_zonelist_nodemask macro iterates over the zones referenced by the zoneref array of the allocation context's zonelist:

#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))

next_zones_zonelist() returns the next zone at or below highest_zoneidx among the zones allowed by nodemask, using the current zone as the iteration cursor.

Inside the loop body, the first check is whether cpusets are enabled and whether the current zone satisfies the cpuset constraints; if not, move on to the next zone.

Next it checks whether the number of dirty pages on the zone's node exceeds the limit; if it does, move on to the next zone.

If alloc_flags contains ALLOC_NOFRAGMENT but the current zone is not the preferred zone and belongs to a remote node, the flag is cleared and the scan restarts, because locality matters more than fragmentation avoidance.

Then come the watermark checks: the zone's watermark is fetched, and if ALLOC_NO_WATERMARKS is set allocation proceeds directly; if the watermark check fails, node_reclaim() is called to reclaim pages, and if there still is not enough memory afterwards the next zone is tried.

After getting through the checks in the for loop, rmqueue() is finally called to perform the actual allocation.

static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
struct page *page;

/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));

if (likely(pcp_allowed_order(order))) {
page = rmqueue_pcplist(preferred_zone, zone, order,
migratetype, alloc_flags);
if (likely(page))
goto out;
}

page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
migratetype);

out:
/* Separate test+clear to avoid unnecessary atomics */
if ((alloc_flags & ALLOC_KSWAPD) &&
unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}

VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
}

On entry, the function first checks whether the requested order qualifies for allocation through the pcp:

static inline bool pcp_allowed_order(unsigned int order)
{
if (order <= PAGE_ALLOC_COSTLY_ORDER)
return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order == pageblock_order)
return true;
#endif
return false;
}

pcp here refers to the per-cpu pageset, so let me briefly introduce the concept (in older kernels only order-0 requests could be served from the pcp; in newer kernels it appears that any order up to 3 qualifies). A Linux system sees a huge number of order-0 allocation requests, and if every one of them had to take zone->lock, more CPU cores would simply mean more lock contention. Linux's answer is the per-cpu pageset, i.e. the pcp: each CPU grabs a batch of pages at once and keeps them locally, freed pages are also parked there, and only when the local pool is full are pages handed back to the zone (my mental model is that it is simply a cache).

Once the order check passes, rmqueue_pcplist() is called:

static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long __maybe_unused UP_flags;

/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
pcp_trylock_finish(UP_flags);
return NULL;
}

/*
* On allocation, reduce the number of pages that are batch freed.
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
}

The flow of rmqueue_pcplist() is simple: grab the pcp, pick the list that matches the order and migrate type (older kernels did not key the list on order), then call __rmqueue_pcplist() to actually take a page.

static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;

do {
if (list_empty(list)) {
int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;

alloced = rmqueue_bulk(zone, order,
batch, list,
migratetype, alloc_flags);

pcp->count += alloced << order;
if (unlikely(list_empty(list)))
return NULL;
}

page = list_first_entry(list, struct page, pcp_list);
list_del(&page->pcp_list);
pcp->count -= 1 << order;
} while (check_new_pages(page, order));

return page;
}

The logic of __rmqueue_pcplist() is straightforward: if the list is empty, rmqueue_bulk() is called to pull a batch of pages from the zone into the pcp; otherwise a page is taken straight off the list, checked with check_new_pages(), and returned if it passes.

static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
unsigned long flags;
int i;

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;

/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->pcp_list, list);
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}

__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock_irqrestore(&zone->lock, flags);

return i;
}

A quick look at rmqueue_bulk(): it loops batch (count) times, allocating a page with __rmqueue() on each iteration and appending it to the list. The inner __rmqueue() is covered together with the buddy path below.

static __always_inline
struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
unsigned int order, unsigned int alloc_flags,
int migratetype)
{
struct page *page;
unsigned long flags;

do {
page = NULL;
spin_lock_irqsave(&zone->lock, flags);
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);

/*
* If the allocation fails, allow OOM handling access
* to HIGHATOMIC reserves as failing now is worse than
* failing a high-order atomic allocation in the
* future.
*/
if (!page && (alloc_flags & ALLOC_OOM))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);

if (!page) {
spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
}
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order));

__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);

return page;
}

Back in rmqueue(): if the requested order cannot be served from the pcp, execution falls through to rmqueue_buddy(), which chooses different allocation routines depending on alloc_flags.

static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;

if (IS_ENABLED(CONFIG_CMA)) {
/*
* Balance movable allocations between regular and CMA areas by
* allocating from CMA when over half of the zone's free memory
* is in the CMA area.
*/
if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
return page;
}
}
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);

if (!page && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
}
return page;
}

Looking at __rmqueue() first: the real work is still done by __rmqueue_smallest(), but there is a preliminary CMA-balancing check, and when CMA allocation is allowed and more than half of the zone's free memory sits in the CMA area, __rmqueue_cma_fallback() is tried first.

#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif

From its definition, __rmqueue_cma_fallback() merely switches the migratetype to MIGRATE_CMA and calls __rmqueue_smallest(), which is the core function of the whole buddy system.

static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;

/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
return page;
}

return NULL;
}

Hopefully you still remember the struct zone section earlier in this article; here is a brief text recap (if not, the outline on the side links back to the diagram). A zone has a free_area field, used by the buddy system to manage pages by order: it is an array of struct free_area indexed by the block's order, and free_area itself is a structure containing a free_list array indexed by migratetype.
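
As a memory aid, the relationship looks roughly like this (a simplified sketch, not the kernel's exact definitions — those live in include/linux/mmzone.h, and the array sizes depend on kernel version and config):

#define MAX_ORDER     10  /* top buddy order; version/config dependent */
#define MIGRATE_TYPES 6   /* number of migratetypes; config dependent  */

struct list_head { struct list_head *next, *prev; };

struct free_area {
	struct list_head free_list[MIGRATE_TYPES]; /* one list per migratetype */
	unsigned long    nr_free;                  /* free blocks of this order */
};

struct zone_sketch {
	/* ... */
	struct free_area free_area[MAX_ORDER + 1]; /* indexed by block order */
	/* ... */
};

/*
 * __rmqueue_smallest() therefore walks
 *     zone->free_area[order].free_list[migratetype]
 * from the requested order upwards until it finds a non-empty list.
 */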

With that recap done, back to __rmqueue_smallest(): it loops starting from the requested order, uses the order to index into the zone's free_area, and then uses the migratetype to fetch a page from the corresponding free list. Once a page is found it is unlinked from the free list; the interesting part is the expand() call.

static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;

while (high > low) {
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;

add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}

When current_order is larger than the requested order, the loop in expand() runs: high is decremented and size halved, the upper half of the block is put back on the free list for order high (with set_buddy_order() recording that order), and the lower half is split again, repeating until high == low.
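
A concrete example may help: suppose an order-2 block (4 pages) is requested but the smallest free block found is order 4 (16 pages). The userspace sketch below mimics what expand() does — it repeatedly halves the block, returning the upper half to the free list of the halved order until only the requested order remains (expand_sketch and the example pfn are made up for the illustration):

#include <stdio.h>

/*
 * Illustration of expand(): carve an order-`low` block out of an
 * order-`high` block starting at `pfn`, printing where each split-off
 * upper half would be queued.
 */
static void expand_sketch(unsigned long pfn, unsigned int low, unsigned int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* upper half goes back on free_area[high], lower half is split again */
		printf("put pfn %#lx (order %u, %lu pages) back on the free list\n",
		       pfn + size, high, size);
	}
	printf("hand out pfn %#lx as the order-%u allocation\n", pfn, low);
}

int main(void)
{
	expand_sketch(0x1000, 2, 4); /* want order 2, found an order-4 block */
	return 0;
}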

The __alloc_pages_slowpath() function

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
unsigned int zonelist_iter_cookie;
int reserve_flags;

restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();

/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask, order);

/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;

/*
* Check for insane configurations where the cpuset doesn't contain
* any suitable zone to satisfy the request - e.g. non-movable
* GFP_HIGHUSER allocations from MOVABLE nodes only.
*/
if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
struct zoneref *z = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx,
&cpuset_current_mems_allowed);
if (!z->zone)
goto nopage;
}

if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);

/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;

/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;

/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes some THP page fault allocations
*/
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or is prohibited because it recently failed at this
* order, fail immediately unless the allocator has
* requested compaction and reclaim retry.
*
* Reclaim is
* - potentially very expensive because zones are far
* below their low watermarks or this is part of very
* bursty high order allocations,
* - not guaranteed to help because isolate_freepages()
* may not iterate over freed pages as part of its
* linear scan, and
* - unlikely to make entire pageblocks free on its
* own.
*/
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage;

/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}

retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);

reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
(alloc_flags & ALLOC_KSWAPD);

/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
}

/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;

/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
goto nopage;

/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;

/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;

/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;

/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;

/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*/
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;

if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;

/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;


/*
* Deal with possible cpuset update races or zonelist updates to avoid
* a unnecessary OOM kill.
*/
if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
check_retry_zonelist(zonelist_iter_cookie))
goto restart;

/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;

/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;

/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}

nopage:
/*
* Deal with possible cpuset update races or zonelist updates to avoid
* a unnecessary OOM kill.
*/
if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
check_retry_zonelist(zonelist_iter_cookie))
goto restart;

/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
goto fail;

/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);

/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE_GFP(costly_order, gfp_mask);

/*
* Help non-failing allocations by giving some access to memory
* reserves normally used for high priority non-blocking
* allocations but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse.
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
if (page)
goto got_pg;

cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}

You have presumably seen page migration in an operating systems course; the kernel's memory compaction mechanism deserves a quick mention here as well: it defragments memory by migrating scattered in-use pages so that scattered free pages coalesce into large contiguous free blocks, and it only moves pages that are movable.

Now for the flow of the function. It first recomputes alloc_flags from the original gfp flags and recalculates the preferred zone; if ALLOC_KSWAPD is set, wake_all_kswapds() is called to wake the kswapd threads for background reclaim.

It then simply retries the fast-path allocation; if that succeeds, the page is returned.

Next, __alloc_pages_direct_compact() is called to run memory compaction; internally it retries the fast-path allocation after compacting, and on success the page is returned.

Execution then enters the retry label, where wake_all_kswapds() is called again so that kswapd does not go back to sleep while we loop.

The zonelist and alloc_flags are adjusted, and the fast path is attempted once more; on success the page is returned.

If gfp_mask lacks __GFP_DIRECT_RECLAIM, or the task's PCB has PF_MEMALLOC set (to avoid reclaim recursion), it jumps straight to the nopage label.

Then __alloc_pages_direct_reclaim() performs direct reclaim (internally via __perform_reclaim()) followed by a fast-path attempt; on success the page is returned.

Then __alloc_pages_direct_compact() runs compaction followed by a fast-path attempt; on success the page is returned.

If __GFP_NORETRY is set, or the allocation is costly (order > PAGE_ALLOC_COSTLY_ORDER) and __GFP_RETRY_MAYFAIL is not set, it jumps to the nopage label.

should_reclaim_retry() decides whether reclaim should be retried; if so, jump back to the retry label.

should_compact_retry() decides whether compaction should be retried; if so, jump back to the retry label.

check_retry_cpuset() (together with check_retry_zonelist()) checks whether the cpuset or zonelist changed underneath us; if so, jump back to the restart label at the top.

If none of the checks above caused a jump, __alloc_pages_may_oom() is called to try to free memory by killing processes: it first makes one more fast-path attempt, then calls out_of_memory() to kill the most suitable victim, and finally, if __GFP_NOFAIL is set, __alloc_pages_cpuset_fallback() is called to try the allocation once more.

static inline struct page *
__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags,
const struct alloc_context *ac)
{
struct page *page;

page = get_page_from_freelist(gfp_mask, order,
alloc_flags|ALLOC_CPUSET, ac);
/*
* fallback to ignore cpuset restriction if our nodes
* are depleted
*/
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);

return page;
}

This function walks the fast path twice; the first attempt additionally carries the ALLOC_CPUSET flag, and if the allowed nodes are depleted the second attempt ignores the cpuset restriction.

Continuing with the __alloc_pages_slowpath() flow: if the current task itself became the OOM victim, it jumps to the nopage label; if killing processes made progress, it jumps back to retry.

Finally, at the nopage label, check_retry_cpuset()/check_retry_zonelist() are consulted once more; if anything changed, it jumps back to the restart label at the top.

If __GFP_NOFAIL is set, a series of warnings is emitted and __alloc_pages_cpuset_fallback() is tried again (with ALLOC_MIN_RESERVE); if that still fails, it jumps back to retry. Otherwise the failure is reported and the result is returned.

(A borrowed flowchart of the slow path went here; the diagram makes the slow-path logic considerably easier to follow than the prose.)

Upper-level allocation wrappers

static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
warn_if_node_offline(nid, gfp_mask);

return __alloc_pages(gfp_mask, order, nid, NULL);
}

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();

return __alloc_pages_node(nid, gfp_mask, order);
}

static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_node(numa_node_id(), gfp_mask, order);
}

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;

page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

__alloc_pages() has a family of wrappers like these above it; they differ only in what they return: __get_free_pages() returns the kernel virtual address, while alloc_pages() returns the struct page pointer.
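
As a usage sketch, a hypothetical minimal kernel module that grabs four contiguous pages through each interface and frees them again might look like the following (page_demo is an invented name; the APIs themselves — alloc_pages(), page_address(), __free_pages(), __get_free_pages(), free_pages() — are the ones shown above):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int __init page_demo_init(void)
{
	struct page *page;
	unsigned long addr;

	/* alloc_pages() hands back a struct page *; map it to a virtual address ourselves */
	page = alloc_pages(GFP_KERNEL, 2);           /* order 2 => 4 contiguous pages */
	if (!page)
		return -ENOMEM;
	pr_info("alloc_pages: page %px maps to %px\n", page, page_address(page));
	__free_pages(page, 2);

	/* __get_free_pages() hands back the kernel virtual address directly */
	addr = __get_free_pages(GFP_KERNEL, 2);
	if (!addr)
		return -ENOMEM;
	pr_info("__get_free_pages: addr %lx\n", addr);
	free_pages(addr, 2);

	return 0;
}

static void __exit page_demo_exit(void) { }

module_init(page_demo_init);
module_exit(page_demo_exit);
MODULE_LICENSE("GPL");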

The __free_one_page() function

/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PageBuddy.
* Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- nyc
*/

static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
struct page *buddy;
bool to_tail;

VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);

VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);

while (order < MAX_ORDER) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}

buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
goto done_merging;

if (unlikely(order >= pageblock_order)) {
/*
* We want to prevent merge between freepages on pageblock
* without fallbacks and normal pageblock. Without this,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);

if (migratetype != buddy_mt
&& (!migratetype_is_mergeable(migratetype) ||
!migratetype_is_mergeable(buddy_mt)))
goto done_merging;
}

/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}

done_merging:
set_buddy_order(page, order);

if (fpi_flags & FPI_TO_TAIL)
to_tail = true;
else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

if (to_tail)
add_to_free_list_tail(page, zone, order, migratetype);
else
add_to_free_list(page, zone, order, migratetype);

/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}

The main job of this function is to release a given block of pages back to a given zone. Note that the "one page" here is not a single page frame but a contiguous block of memory (possibly many pages). It is a low-level free routine, so the caller must supply the struct page of the block being freed, its page frame number, the block's order, the target zone, the migratetype and so on — information normally filled in by the higher-level wrappers. All the function itself does is hook the block back onto the appropriate free list and attempt merging. It is the core page-freeing function of the buddy system, and every page-free API is ultimately a wrapper around it.

The block that pairs up with the block being freed is called its buddy: the two blocks are physically contiguous and together form a single block one order higher, hence "a pair of buddies".

Now for the basic flow of the function:

First, find_buddy_page_pfn() is used to locate and validate the buddy.

static inline struct page *find_buddy_page_pfn(struct page *page,
unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
{
unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
struct page *buddy;

buddy = page + (__buddy_pfn - pfn);
if (buddy_pfn)
*buddy_pfn = __buddy_pfn;

if (page_is_buddy(page, buddy, order))
return buddy;
return NULL;
}

It first derives the buddy's pfn from the pfn and order of the block being freed, then obtains the buddy's struct page pointer via pointer arithmetic on the pfn difference, and finally calls page_is_buddy() to decide whether the pair really qualifies as buddies. There are four conditions:

  • the buddy is not in a memory hole
  • the buddy is in the buddy system (i.e. it is itself a free block)
  • the page being freed and its buddy are in the same zone
  • the page being freed and its buddy have the same order

Next comes a special case: once the order reaches pageblock_order, a regular pageblock could end up being merged with a pageblock whose migratetype has no fallbacks (an isolated or CMA pageblock, for example), which would corrupt the freepage, CMA, or HIGHATOMIC accounting. To prevent this, the kernel adds the `if (unlikely(order >= pageblock_order))` check, which verifies that the two migratetypes are mergeable; if they are, merging may continue to a higher order, otherwise execution jumps straight to the done_merging label.

At the end of each iteration the buddy is checked for being a guard page: if so, clear_page_guard() clears that property and turns it back into an ordinary free page (implemented by zeroing the page's private field); otherwise it is a regular free page and del_page_from_free_list() unlinks it.

At this point the new higher-order block has been formed, and we return to the top of the loop to look for the buddy of this merged block; the loop keeps going until the order reaches MAX_ORDER (typically 10). The pfn used for the next iteration is computed as buddy_pfn & pfn, and the corresponding struct page is found with the pointer arithmetic page + (combined_pfn - pfn).
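
The buddy's pfn is simply the original pfn with the order-th bit flipped (__find_buddy_pfn() XORs 1 << order), so the merging arithmetic can be checked with a few lines of standalone C (the example pfn 0x1a2c4 is arbitrary but order-2 aligned):

#include <stdio.h>

/* pfn of the buddy block at a given order: flip the order-th bit. */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1a2c4;
	unsigned int order = 2;

	unsigned long buddy_pfn    = find_buddy_pfn(pfn, order);
	unsigned long combined_pfn = buddy_pfn & pfn;

	printf("freeing pfn %#lx at order %u\n", pfn, order);
	printf("buddy pfn    = %#lx\n", buddy_pfn);      /* 0x1a2c0 */
	printf("combined pfn = %#lx (start of the order-%u block)\n",
	       combined_pfn, order + 1);                 /* 0x1a2c0 */
	return 0;
}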

Finally, at the done_merging label, set_buddy_order() records the block's order in the page's private field. Then to_tail is decided: if the FPI_TO_TAIL flag is set, to_tail is true; otherwise, if the block's order is at least SHUFFLE_ORDER (the top buddy order), to_tail is the random result of shuffle_pick_tail(); otherwise it is the result of buddy_merge_likely(), which checks whether the next-higher-order buddy is also free — if so, the block being freed is likely to be merged soon, so it should go to the tail of the list where it is less likely to be handed out to another allocation before the merge into a higher-order block can happen. If to_tail is true, add_to_free_list_tail() appends the block to the end of the free list; otherwise add_to_free_list() puts it at the head.

Upper-level free wrappers

static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);

if (!free_pages_prepare(page, order, fpi_flags))
return;

/*
* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
* is used to avoid calling get_pfnblock_migratetype() under the lock.
* This will reduce the lock holding time.
*/
migratetype = get_pfnblock_migratetype(page, pfn);

spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);

__count_vm_events(PGFREE, 1 << order);
}

static inline void free_the_page(struct page *page, unsigned int order)
{
if (pcp_allowed_order(order)) /* Via pcp? */
free_unref_page(page, order);
else
__free_pages_ok(page, order, FPI_NONE);
}

void __free_pages(struct page *page, unsigned int order)
{
/* get PageHead before we drop reference */
int head = PageHead(page);

if (put_page_testzero(page))
free_the_page(page, order);
else if (!head)
while (order-- > 0)
free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);

void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
__free_pages(virt_to_page((void *)addr), order);
}
}

EXPORT_SYMBOL(free_pages);

All of the page-freeing functions are in the end wrappers around __free_one_page() and eventually reach it; the code above shows one of those call paths.

Afterword

🙏🙏🙏 I have been far too lazy lately: this article was created on 29 November 2023 and only got finished and published today. There were indeed things coming up from time to time, but nothing that should have cost this much time; the truth is that when I did have time I simply didn't feel like writing. Even with gaming in the evenings it could have been finished long ago. No more slacking!

The follow-up articles on syzkaller will probably wait a while: my fuzzing fundamentals are not very solid, so I plan to first learn how to implement a fuzzer myself before going back to the source — otherwise, even if I manage to untangle the logic, I won't be able to improve anything with my own hands.

So the upcoming posts will most likely cover Linux rootkits, implementing a userspace fuzzer, writing an operating system from scratch, and the like.
