Memory Management Series 04: Buddy Allocator Principles (Part 2)


The fallback zone list (zonelist), which contains all memory nodes, can be ordered in one of two ways:
1) Node order: sort nodes by distance from near to far first, then sort the zones within each node from the highest zone type to the lowest. This prefers nearby memory; the drawback is that low zones can be consumed before the high zones are exhausted. For example, the DMA zone is usually small, so node order increases the chance of exhausting the DMA zone.
2) Zone order: sort by zone type from high to low first, then sort the nodes within each zone type by distance from near to far. The advantage is that it reduces the chance of exhausting the low zones; the drawback is that it cannot guarantee that the nearest memory is chosen first.
The default setting picks the more suitable order automatically: on a 64-bit system, node order is chosen because relatively few allocations need the DMA and DMA32 zones; on a 32-bit system, zone order is chosen, as the kernel snippet below shows.
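How the default is chosen can be seen in mm/page_alloc.c of 4.x kernels (zone order was removed around 4.14, after which node order is the only behaviour); abridged:

#if defined(CONFIG_64BIT)
/*
 * Devices that require DMA32/DMA are relatively rare and do not justify a
 * penalty to every machine in case the specialised case applies. Default
 * to Node-ordering on 64-bit NUMA machines.
 */
static int default_zonelist_order(void)
{
	return ZONELIST_ORDER_NODE;
}
#else
/*
 * On 32-bit, the Normal zone needs to be preserved for allocations
 * accessible by the kernel, so default to zone ordering.
 */
static int default_zonelist_order(void)
{
	return ZONELIST_ORDER_ZONE;
}
#endif /* CONFIG_64BIT */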
5. Zone watermarks
Under what circumstances does the preferred zone borrow physical pages from the fallback zones? Each zone has three watermarks:
1) High watermark (high): if the zone's free page count is above the high watermark, the zone has plenty of memory.
2) Low watermark (low): if the zone's free page count is below the low watermark, the zone is slightly short of memory.
3) Minimum watermark (min): if the zone's free page count is below the minimum watermark, the zone is severely short of memory.
The data structure is as follows (a sketch is shown below):
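A sketch based on include/linux/mmzone.h in 4.x kernels (in 4.20 and later the array is renamed _watermark and a watermark_boost field is added): the three watermarks live in an array inside struct zone, indexed by enum zone_watermarks and accessed through helper macros.

enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

#define min_wmark_pages(z)	(z->watermark[WMARK_MIN])
#define low_wmark_pages(z)	(z->watermark[WMARK_LOW])
#define high_wmark_pages(z)	(z->watermark[WMARK_HIGH])

struct zone {
	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long watermark[NR_WMARK];
	/* ... */
};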
Memory below the minimum watermark is called the emergency reserve. When memory is severely short, it is given to processes that make this promise: "allocate us a small amount of the emergency reserve, and we can free even more memory."
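Whether a request may dip into this reserve is decided by gfp_pfmemalloc_allowed(), which the slow path (section 3 below) consults. A simplified sketch based on 4.x mm/page_alloc.c; the exact set of conditions varies across kernel versions:

/*
 * May this allocation ignore the watermarks and use the emergency reserve?
 */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
		return false;	/* caller explicitly refuses the reserves */
	if (gfp_mask & __GFP_MEMALLOC)
		return true;	/* caller promises to free memory shortly */
	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
		return true;	/* softirq working on behalf of such a task */
	if (!in_interrupt() && (current->flags & PF_MEMALLOC))
		return true;	/* reclaim itself, e.g. kswapd or direct reclaim */
	return false;
}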
Key data fields for watermark control in the kernel source
How are the three watermark values HIGH/LOW/MIN computed? They are derived from a handful of per-zone page counts in struct zone:
unsigned long spanned_pages;	/* all pages spanned by the zone, including holes;
				   spanned_pages = zone_end_pfn - zone_start_pfn */
unsigned long present_pages;	/* physical pages actually present in the zone, excluding holes;
				   present_pages = spanned_pages - absent_pages(pages in holes) */
unsigned long managed_pages;	/* pages managed by the buddy allocator;
				   managed_pages = present_pages - reserved_pages */
The relationship between the three: spanned_pages >= present_pages >= managed_pages.
min_free_kbytes is the lower bound on the free memory the system keeps in reserve; the watermark[] array is computed from min_free_kbytes, as the excerpt below shows.
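Abridged from __setup_per_zone_wmarks() in mm/page_alloc.c of 4.x kernels (details vary by version; highmem handling and per-CPU drift accounting are omitted):

static void __setup_per_zone_wmarks(void)
{
	/* convert min_free_kbytes into pages */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* calculate the total number of non-highmem pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone->managed_pages;
	}

	for_each_zone(zone) {
		u64 tmp;

		spin_lock_irqsave(&zone->lock, flags);
		/* this zone's share of pages_min, weighted by managed_pages */
		tmp = (u64)pages_min * zone->managed_pages;
		do_div(tmp, lowmem_pages);
		zone->watermark[WMARK_MIN] = tmp;

		/* gap between watermarks: max(min/4, managed * scale / 10000) */
		tmp = max_t(u64, tmp >> 2,
			    mult_frac(zone->managed_pages,
				      watermark_scale_factor, 10000));

		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;

		spin_unlock_irqrestore(&zone->lock, flags);
	}
}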
II. Allocating pages
In the Linux kernel, every page-allocation function eventually reaches __alloc_pages_nodemask(), which is described as the heart of the zoned buddy allocator. A minimal usage sketch follows.
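For illustration, a hypothetical caller can go through the alloc_pages() wrapper, which funnels down to __alloc_pages_nodemask():

#include <linux/gfp.h>

/* Illustrative only: allocate 2^2 = 4 physically contiguous pages. */
static int example_alloc(void)
{
	struct page *pages;

	pages = alloc_pages(GFP_KERNEL, 2);	/* ends up in __alloc_pages_nodemask() */
	if (!pages)
		return -ENOMEM;

	/* ... use the pages ... */

	__free_pages(pages, 2);			/* return them to the buddy allocator */
	return 0;
}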
1. The core function of the zoned buddy allocator
__alloc_pages_nodemask() takes four parameters (prototype below):
gfp_mask: the allocation flags;
order: the allocation order;
zonelist: the fallback zone list of the preferred memory node;
nodemask: the set of memory nodes pages may be allocated from; callers with no such requirement can pass a null pointer.
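The prototype in 4.x kernels (later kernels reworked this entry point, so the exact signature varies by version):

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
		       struct zonelist *zonelist, nodemask_t *nodemask);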
Algorithm flow (a skeleton of the function follows this list):
1) Derive the preferred zone type and the migrate type from the allocation flags.
2) Run the fast path: make the first allocation attempt using the low watermark.
3) Only if the fast-path allocation fails, run the slow path.
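An abridged skeleton of __alloc_pages_nodemask() from 4.x mm/page_alloc.c (statistics and error handling omitted), showing the three steps:

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
		       struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;	/* fast path checks the low watermark */
	gfp_t alloc_mask = gfp_mask;
	struct alloc_context ac = { };

	/* 1) derive the zone type, migrate type etc. from the GFP flags */
	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask,
				 &ac, &alloc_mask, &alloc_flags))
		return NULL;
	finalise_ac(gfp_mask, order, &ac);

	/* 2) fast path: first attempt, against the low watermark */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		return page;

	/* 3) slow path: reclaim, compaction and, eventually, the OOM killer */
	return __alloc_pages_slowpath(alloc_mask, order, &ac);
}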
The page allocator defines internal allocation flags; an excerpt follows:
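From mm/internal.h of 4.x kernels (more flags were added in later versions):

/* The ALLOC_WMARK bits are used as an index into zone->watermark */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04	/* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)

#define ALLOC_HARDER		0x10	/* try to alloc harder */
#define ALLOC_HIGH		0x20	/* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40	/* check for correct cpuset */
#define ALLOC_CMA		0x80	/* allow allocations from CMA areas */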
2. The fast path calls the following function:
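An abridged sketch of get_page_from_freelist() from 4.x mm/page_alloc.c (node-reclaim, dirty-limit and fairness details omitted): it scans the fallback zone list, skips zones whose free pages fall below the watermark selected by alloc_flags, and takes pages from the first zone that passes the check.

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
		       const struct alloc_context *ac)
{
	struct zoneref *z = ac->preferred_zoneref;
	struct zone *zone;

	/* walk the zonelist, starting from the preferred zone */
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
					ac->high_zoneidx, ac->nodemask) {
		struct page *page;
		unsigned long mark;

		/* pick the watermark encoded in alloc_flags (min/low/high) */
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		if (!zone_watermark_fast(zone, order, mark,
					 ac_classzone_idx(ac), alloc_flags))
			continue;	/* the real code may try node_reclaim() first */

		/* take pages from the per-CPU lists or the buddy free lists */
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
			       gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);
			return page;
		}
	}
	return NULL;
}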
3. Kernel source analysis of the slow path (the slow path runs only after allocation at the low watermark has failed):
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
		       struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned long alloc_start = jiffies;
	unsigned int stall_timeout = 10 * HZ;
	unsigned int cpuset_mems_cookie;

	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	/* the requested order must not exceed the allocator's maximum order */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
			 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	/*
	 * We may later check which memory nodes the cpuset allows the current
	 * task to allocate from; reading current->mems_allowed is protected
	 * by a sequence lock.
	 */
	cpuset_mems_cookie = read_mems_allowed_begin();

	/* convert the GFP flags into internal allocation flags */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/* find the preferred zone */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	if (!ac->preferred_zoneref->zone)
		goto nopage;

	/* asynchronous reclaim: wake up the kswapd page-reclaim threads */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	/* try to allocate using the min watermark */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/*
	 * For order > 0 requests, try direct compaction first if all three
	 * of the conditions below hold.
	 */
	if (can_direct_reclaim &&
	    (costly_order ||
	     (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) &&
	    !gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						    alloc_flags, ac,
						    INIT_COMPACT_PRIORITY,
						    &compact_result);
		if (page)
			goto got_pg;

		/*
		 * Checks for costly allocations with __GFP_NORETRY, which
		 * includes THP page fault allocations
		 */
		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
			/*
			 * If compaction is deferred for high-order allocations,
			 * it is because sync compaction recently failed. If
			 * this is the case and the caller requested a THP
			 * allocation, we do not want to heavily disrupt the
			 * system, so we fail the allocation instead of entering
			 * direct reclaim.
			 */
			if (compact_result == COMPACT_DEFERRED)
				goto nopage;

			/*
			 * Looks like reclaim/compaction is worth trying, but
			 * sync compaction could be very expensive, so keep
			 * using async compaction.
			 */
			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/* make sure kswapd doesn't accidentally go to sleep while we loop */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	if (gfp_pfmemalloc_allowed(gfp_mask))
		alloc_flags = ALLOC_NO_WATERMARKS;

	/*
	 * Reset the zonelist iterators if memory policies can be ignored.
	 * These allocations are high priority and system rather than user
	 * orientated.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* attempt with the potentially adjusted zonelist and alloc flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim)
		goto nopage;

	/* Make sure we know about allocations which stall for too long */
	if (time_after(jiffies, alloc_start + stall_timeout)) {
		warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
			   "page allocation stalls for %ums, order:%u",
			   jiffies_to_msecs(jiffies - alloc_start), order);
		stall_timeout += 10 * HZ;
	}

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/* try direct page reclaim, then allocating again */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
					    &did_some_progress);
	if (page)
		goto got_pg;

	/* for order > 0: run memory compaction in synchronous mode, then allocate */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					    compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* give up if the caller asked us not to retry */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_REPEAT
	 */
	if (costly_order && !(gfp_mask & __GFP_REPEAT))
		goto nopage;

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	/*
	 * It doesn't make any sense to retry for the compaction if the order-0
	 * reclaim is not able to make any progress because the current
	 * implementation of the compaction depends on the sufficient amount
	 * of free memory (see __compaction_suitable)
	 */
	/* for order > 0: decide whether compaction is worth retrying */
	if (did_some_progress > 0 &&
	    should_compact_retry(ac, order, alloc_flags,
				 compact_result, &compact_priority,
				 &compaction_retries))
		goto retry;

	/*
	 * It's possible we raced with cpuset update so the OOM would be
	 * premature (see below the nopage: label for full explanation).
	 */
	/* if the cpuset changed the set of nodes we may allocate from, retry */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/* Reclaim has failed us, start killing things */
	/* let the OOM killer pick a victim process and kill it */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Avoid allocations with no watermarks from looping endlessly */
	/*
	 * Give up if the current task is being killed by the OOM killer and
	 * either ignores the watermarks or may not use the emergency reserve.
	 */
	if (test_thread_flag(TIF_MEMDIE) &&
	    (alloc_flags == ALLOC_NO_WATERMARKS ||
	     (gfp_mask & __GFP_NOMEMALLOC)))
		goto nopage;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/*
	 * When updating a task's mems_allowed or mempolicy nodemask, it is
	 * possible to race with parallel threads in such a way that our
	 * allocation can fail while the mask is being updated. If we are about
	 * to fail, check if the cpuset changed during allocation and if so,
	 * retry.
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/*
	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
	 * we always retry
	 */
	if (gfp_mask & __GFP_NOFAIL) {
		/*
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually require GFP_NOWAIT
		 */
		if (WARN_ON_ONCE(!can_direct_reclaim))
			goto fail;

		/*
		 * PF_MEMALLOC request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us
		 */
		WARN_ON_ONCE(current->flags & PF_MEMALLOC);

		/*
		 * non failing costly orders are a hard requirement which we
		 * are not prepared for much so let's warn about these users
		 * so that we can identify them and convert them to something
		 * else.
		 */
		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves but do not use ALLOC_NO_WATERMARKS because this
		 * could deplete whole memory reserves which would just make
		 * the situation worse
		 */
		page = __alloc_pages_cpuset_fallback(gfp_mask, order,
						     ALLOC_HARDER, ac);
		if (page)
			goto got_pg;

		cond_resched();
		goto retry;
	}
fail:
	warn_alloc(gfp_mask, ac->nodemask,
		   "page allocation failure: order:%u", order);
got_pg:
	return page;
}