Linux Memory Management (4): Allocating Physical Pages

    Series: Linux Memory Management

    Keywords: allocation mask (gfp_mask), buddy system, watermark, merging of free buddy blocks

    Linux manages memory in units of pages, and physical pages are managed through the buddy system.

    As the Linux memory-management framework diagram shows, the page allocator is the foundation on which all the other assorted memory operations are built.

    That is why, after covering "Linux Memory Management (1): Physical Memory Initialization", "Linux Memory Management (2): The Page Table Mapping Process" and "Linux Memory Management (3): Kernel Memory Layout", the next thing to understand is the page allocator.

    1. Important Data Structures

    1.1 The Page Allocation Mask

    alloc_pages is the commonly used kernel interface for allocating physical pages. It takes two parameters, one of which is the allocation mask.

    include/linux/gfp.h holds the GFP (Get Free Page) allocation masks. They fall into two classes: masks beginning with __GFP_, and masks beginning with GFP_, which are generally combinations of the __GFP_ ones.

    The __GFP_ masks themselves split into two groups: zone modifiers and action modifiers.

    Zone modifiers occupy the low 4 bits of the mask and specify which zone pages should be allocated from.

    Action modifiers define attributes of the allocation (whether it may sleep, perform I/O, and so on).

    /* Plain integer GFP bitmasks. Do not use this directly. */
    #define ___GFP_DMA        0x01u
    #define ___GFP_HIGHMEM        0x02u
    #define ___GFP_DMA32        0x04u
    #define ___GFP_MOVABLE        0x08u
    #define ___GFP_WAIT        0x10u
    #define ___GFP_HIGH        0x20u
    #define ___GFP_IO        0x40u
    #define ___GFP_FS        0x80u
    #define ___GFP_COLD        0x100u
    #define ___GFP_NOWARN        0x200u
    #define ___GFP_REPEAT        0x400u
    #define ___GFP_NOFAIL        0x800u
    #define ___GFP_NORETRY        0x1000u
    #define ___GFP_MEMALLOC        0x2000u
    #define ___GFP_COMP        0x4000u
    #define ___GFP_ZERO        0x8000u
    #define ___GFP_NOMEMALLOC    0x10000u
    #define ___GFP_HARDWALL        0x20000u
    #define ___GFP_THISNODE        0x40000u
    #define ___GFP_RECLAIMABLE    0x80000u
    #define ___GFP_NOTRACK        0x200000u
    #define ___GFP_NO_KSWAPD    0x400000u
    #define ___GFP_OTHER_NODE    0x800000u
    #define ___GFP_WRITE        0x1000000u
    /* If the above are modified, __GFP_BITS_SHIFT may need updating */
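
    To make the zone-modifier mapping concrete, here is a simplified, semantically equivalent sketch of gfp_zone() (the real implementation in include/linux/gfp.h packs this mapping into a GFP_ZONE_TABLE bit table; the function below is an illustration written for this article, not the kernel's code):

    /* Simplified illustration of how the low gfp bits select a zone.
     * The real gfp_zone() uses a packed GFP_ZONE_TABLE lookup. */
    static inline enum zone_type gfp_zone_illustrative(gfp_t flags)
    {
        if (flags & __GFP_DMA)
            return ZONE_DMA;
        if (flags & __GFP_DMA32)
            return ZONE_DMA32;
        if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
                (__GFP_HIGHMEM | __GFP_MOVABLE))
            return ZONE_MOVABLE;
        if (flags & __GFP_HIGHMEM)
            return ZONE_HIGHMEM;
        return ZONE_NORMAL;    /* no zone modifier set */
    }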

    In practice, callers mostly use the GFP_-prefixed combinations:

    /* This equals 0, but use constants in case they ever change */
    #define GFP_NOWAIT    (GFP_ATOMIC & ~__GFP_HIGH)
    /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
    #define GFP_ATOMIC    (__GFP_HIGH)
    #define GFP_NOIO    (__GFP_WAIT)
    #define GFP_NOFS    (__GFP_WAIT | __GFP_IO)
    #define GFP_KERNEL    (__GFP_WAIT | __GFP_IO | __GFP_FS)
    #define GFP_TEMPORARY    (__GFP_WAIT | __GFP_IO | __GFP_FS | \
                 __GFP_RECLAIMABLE)
    #define GFP_USER    (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
    #define GFP_HIGHUSER    (GFP_USER | __GFP_HIGHMEM)
    #define GFP_HIGHUSER_MOVABLE    (GFP_HIGHUSER | __GFP_MOVABLE)
    #define GFP_IOFS    (__GFP_IO | __GFP_FS)
    #define GFP_TRANSHUGE    (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
                 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
                 __GFP_NO_KSWAPD)
    
    /*
     * GFP_THISNODE does not perform any reclaim, you most likely want to
     * use __GFP_THISNODE to allocate from a given node without fallback!
     */
    #ifdef CONFIG_NUMA
    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
    #else
    #define GFP_THISNODE    ((__force gfp_t)0)
    #endif
    
    /* This mask makes up all the page movable related flags */
    #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
    
    /* Control page allocator reclaim behavior */
    #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
                __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
    
    /* Control slab gfp mask during early boot */
    #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
    
    /* Control allocation constraints */
    #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
    
    /* Do not use these with a slab allocator */
    #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
    
    /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
       platforms, used as appropriate on others */
    
    #define GFP_DMA        __GFP_DMA
    
    /* 4GB DMA on some platforms */
    #define GFP_DMA32    __GFP_DMA32
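
    As a quick illustration of how callers pick among these combinations (the two helper functions below are made up for this example; only the GFP constants and alloc_pages() are real):

    /* Hypothetical helpers showing typical gfp choices. */
    static struct page *grab_page_process_context(void)
    {
        /* may sleep, may start I/O and filesystem activity to reclaim */
        return alloc_pages(GFP_KERNEL, 0);
    }

    static struct page *grab_page_atomic_context(void)
    {
        /* interrupt/atomic context: must not sleep (__GFP_WAIT clear),
         * may dip into emergency reserves (__GFP_HIGH set) */
        return alloc_pages(GFP_ATOMIC, 0);
    }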

    2. Allocating Memory via the Buddy System

    alloc_page-------------------------------allocates a single page
    get_zeroed_page-->__get_free_pages
        alloc_pages--------------------------allocates 2^order pages
            alloc_pages_node-----------------adds a node id parameter
                __alloc_pages
                    __alloc_pages_nodemask---adds a nodemask parameter
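
    These wrappers are thin. Roughly, in a v4.0-era include/linux/gfp.h they nest as follows (simplified sketch; exact signatures vary between kernel versions):

    static inline struct page *
    __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
    {
        return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
    }

    static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                            unsigned int order)
    {
        /* a negative nid means "the current node" */
        if (nid < 0)
            nid = numa_node_id();

        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }

    #define alloc_pages(gfp_mask, order) \
            alloc_pages_node(numa_node_id(), gfp_mask, order)
    #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)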

     __alloc_pages_nodemask is the 'heart' of the zoned buddy allocator.

    This kernel comment makes two points: __alloc_pages_nodemask is central, and the buddy page allocator here works on a per-zone basis.

    struct alloc_context is the data structure the buddy allocation functions use to carry the allocation parameters.

    struct alloc_context {
        struct zonelist *zonelist;    /* zones to try, in fallback order */
        nodemask_t *nodemask;         /* allowed NUMA nodes, NULL means cpuset default */
        struct zone *preferred_zone;  /* first usable zone found in the zonelist */
        int classzone_idx;            /* index of preferred_zone, for watermark checks */
        int migratetype;              /* migrate type derived from gfp_mask */
        enum zone_type high_zoneidx;  /* highest zone the gfp_mask allows */
    };

    The zonelist here has already been obtained via node_zonelist(nid, gfp_mask): zonelist = NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags)
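
    For reference, node_zonelist() and gfp_zonelist() are one-liners (a sketch based on a v4.0-era include/linux/gfp.h):

    static inline int gfp_zonelist(gfp_t flags)
    {
        if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
            return 1;    /* the node-local (no-fallback) zonelist */

        return 0;        /* the normal zonelist, with fallback to other nodes */
    }

    static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
    {
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
    }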

    struct page *
    __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                struct zonelist *zonelist, nodemask_t *nodemask)
    {
        struct zoneref *preferred_zoneref;
        struct page *page = NULL;
        unsigned int cpuset_mems_cookie;
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
        gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
        .high_zoneidx = gfp_zone(gfp_mask),----------------------------------gfp_zone() maps the low 4 bits of gfp_mask to a zone_type: ZONE_NORMAL? ZONE_HIGHMEM?
            .nodemask = nodemask,
        .migratetype = gfpflags_to_migratetype(gfp_mask),--------------------derive the migratetype from gfp_mask: MIGRATE_RECLAIMABLE? MIGRATE_MOVABLE?
        };
    
        gfp_mask &= gfp_allowed_mask;
    
        lockdep_trace_alloc(gfp_mask);
    
        might_sleep_if(gfp_mask & __GFP_WAIT);
    
        if (should_fail_alloc_page(gfp_mask, order))
            return NULL;
    
        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
            return NULL;
    
        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
            alloc_flags |= ALLOC_CMA;
    
    retry_cpuset:
        cpuset_mems_cookie = read_mems_allowed_begin();
    
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                    ac.nodemask ? : &cpuset_current_mems_allowed,
                    &ac.preferred_zone);
        if (!ac.preferred_zone)
            goto out;
        ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
    
        /* First allocation attempt */
        alloc_mask = gfp_mask|__GFP_HARDWALL;
        page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);---------first attempt to allocate physical pages (the fast path)
        if (unlikely(!page)) {
            /*
             * Runtime PM, block IO and its error handling path
             * can deadlock because I/O on the device might not
             * complete.
             */
            alloc_mask = memalloc_noio_flags(gfp_mask);
    
            page = __alloc_pages_slowpath(alloc_mask, order, &ac);-----------------if the fast path fails, the slow path handles the many special cases here.
        }
    
        if (kmemcheck_enabled && page)
            kmemcheck_pagealloc_alloc(page, order, gfp_mask);
    
        trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
    
    out:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
            goto retry_cpuset;--------------------------------------------------retry the page allocation
    
        return page;
    }

    get_page_from_freelist() iterates over the zones in ac->zonelist, looks for a zone that satisfies all the constraints, allocates pages from it, and returns them.

    static struct page *
    get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                            const struct alloc_context *ac)
    {
        struct zonelist *zonelist = ac->zonelist;
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
        nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
        int zlc_active = 0;        /* set if using zonelist_cache */
        int did_zlc_setup = 0;        /* just call zlc_setup() one time */
        bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                    (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
    
    zonelist_scan:-------------------------------------------------------------------start scanning ac->zonelist.
        zonelist_rescan = false;
    
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
         */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,--------iterate over the zones in the zonelist at or below ac->high_zoneidx; each iteration yields one zone.
                                    ac->nodemask) {
    
    ...-----------------------------------------------------------------------------a series of checks; a zone that fails any of them is skipped (continue to the next zone); a zone that passes moves on to the watermark check.
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];-----------------the alloc_flags here include ALLOC_WMARK_LOW
        if (!zone_watermark_ok(zone, order, mark,-------------------------------so this checks the zone against its low watermark; if it is not met, fall through to further checks or try zone_reclaim.
                           ac->classzone_idx, alloc_flags)) {
                int ret;
    
                /* Checked here to keep the fast path fast */
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (alloc_flags & ALLOC_NO_WATERMARKS)
                    goto try_this_zone;
    ...
            ret = zone_reclaim(zone, gfp_mask, order);-------------------------reclaim some pages via zone_reclaim()
                switch (ret) {
    ...
            default: /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,-----------------------recheck whether the watermark is now satisfied
                        ac->classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

    try_this_zone:---------------------------------------------------------------once the watermark and all other checks pass, pages can be allocated from this zone.
        page = buffered_rmqueue(ac->preferred_zone, zone, order,-----------------take pages from this zone's free lists
                        gfp_mask, ac->migratetype);
        if (page) {
            if (prep_new_page(page, order, gfp_mask, alloc_flags))
                goto try_this_zone;
            return page;
        }
    this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
            zlc_mark_zone_full(zonelist, z);
    }

        /*
         * The first pass makes sure allocations are spread fairly within the
         * local node. However, the local node might have free pages left
         * after the fairness batches are exhausted, and remote zones haven't
         * even been considered yet. Try once more without fairness, and
         * include remote zones now, before entering the slowpath and waking
         * kswapd: prefer spilling to a remote zone over swapping locally.
         */
        if (alloc_flags & ALLOC_FAIR) {
            alloc_flags &= ~ALLOC_FAIR;
            if (nr_fair_skipped) {
                zonelist_rescan = true;
                reset_alloc_batches(ac->preferred_zone);
            }
            if (nr_online_nodes > 1)
                zonelist_rescan = true;
        }

        if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
            /* Disable zlc cache for second zonelist scan */
            zlc_active = 0;
            zonelist_rescan = true;
        }

        if (zonelist_rescan)
            goto zonelist_scan;

        return NULL;
    }

    How the watermark values themselves are calculated is covered in detail in the watermark article.

    Next, look at __zone_watermark_ok(), the function that decides whether a zone's free pages satisfy the watermark selected by alloc_flags.

    Parameters: z is the zone being checked; order is the order of the requested allocation; mark is the watermark value to test against; classzone_idx is the zone index; alloc_flags are the allocation flags; free_pages is the current number of free pages.

    static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx, int alloc_flags,
                long free_pages)
    {
        /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
        long free_cma = 0;
    
        free_pages -= (1 << order) - 1;---------------------------------------------pages remaining after subtracting the request; why the -1??
        if (alloc_flags & ALLOC_HIGH)
            min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
            min -= min / 4;
    ...
        if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])--------free pages (excluding CMA) must exceed min plus the zone's lowmem_reserve
            return false;
        for (o = 0; o < order; o++) {-----------------------------------------------walk every buddy order below the requested one, checking at each step whether the remaining free pages still meet the watermark
            /* At the next order, this order's pages become unavailable */
            free_pages -= z->free_area[o].nr_free << o;-----------------------------subtract this order's free pages from the total
    
            /* Require fewer higher order pages to be free */
            min >>= 1;--------------------------------------------------------------halve the watermark
    
            if (free_pages <= min)--------------------------------------------------then compare against the watermark again
                return false;
        }
        return true;----------------------------------------------------------------all checks passed, return true
    }

    The purpose of the loop can be summed up as follows:
    iterate order by order, checking whether enough large (i.e. higher-order) free blocks remain in the zone.
    Each iteration first subtracts the current order's free pages from the total, because the question is whether enough larger blocks are left.
    And since part of the free pages has been struck off, the bar for comparison should be lowered accordingly.
    How much it is lowered is exactly the right shift of min described above.
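
    To see the loop in action, here is a standalone userspace sketch of the same logic with made-up numbers (the per-order free counts and the watermark are hypothetical, and lowmem_reserve is omitted for brevity):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11

    static bool watermark_ok(unsigned int order, long mark,
                             long free_pages, const long nr_free[MAX_ORDER])
    {
        long min = mark;
        unsigned int o;

        free_pages -= (1 << order) - 1;      /* pages the request would consume */
        if (free_pages <= min)               /* lowmem_reserve omitted here */
            return false;

        for (o = 0; o < order; o++) {
            free_pages -= nr_free[o] << o;   /* blocks too small for this request */
            min >>= 1;                       /* relax the bar correspondingly */
            if (free_pages <= min)
                return false;
        }
        return true;
    }

    int main(void)
    {
        /* hypothetical zone: free block counts for orders 0..10 */
        const long nr_free[MAX_ORDER] = { 200, 100, 50, 20, 8, 4, 2, 1, 0, 0, 0 };
        long free_pages = 0;
        int o;

        for (o = 0; o < MAX_ORDER; o++)
            free_pages += nr_free[o] << o;

        printf("free_pages=%ld, order-3 request ok: %d\n",
               free_pages, watermark_ok(3, 128, free_pages, nr_free));
        return 0;
    }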

    Reference: an analysis of __zone_watermark_ok

    zone_reclaim() attempts some quick, zone-local page reclaim before the allocator gives up on this zone; several early checks make it bail out with ZONE_RECLAIM_FULL or ZONE_RECLAIM_NOSCAN instead of scanning:

    int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
    {
        int node_id;
        int ret;
    
        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
            return ZONE_RECLAIM_FULL;
    
        if (!zone_reclaimable(zone))
            return ZONE_RECLAIM_FULL;
    
        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
            return ZONE_RECLAIM_NOSCAN;
    
        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
            return ZONE_RECLAIM_NOSCAN;
    
        if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
            return ZONE_RECLAIM_NOSCAN;
    
        ret = __zone_reclaim(zone, gfp_mask, order);
        clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
    
        if (!ret)
            count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
    
        return ret;
    }

    buffered_rmqueue() takes pages off the zone's free lists: order-0 requests are served from the per-CPU page (pcp) lists, while higher orders go through __rmqueue() under the zone lock:

    /*
     * Allocate a page from the given zone. Use pcplists for order-0 allocations.
     */
    static inline
    struct page *buffered_rmqueue(struct zone *preferred_zone,
                struct zone *zone, unsigned int order,
                gfp_t gfp_flags, int migratetype)
    {
        unsigned long flags;
        struct page *page;
        bool cold = ((gfp_flags & __GFP_COLD) != 0);
    
        if (likely(order == 0)) {
            struct per_cpu_pages *pcp;
            struct list_head *list;
    
            local_irq_save(flags);
            pcp = &this_cpu_ptr(zone->pageset)->pcp;
            list = &pcp->lists[migratetype];
            if (list_empty(list)) {
                pcp->count += rmqueue_bulk(zone, 0,
                        pcp->batch, list,
                        migratetype, cold);
                if (unlikely(list_empty(list)))
                    goto failed;
            }
    
            if (cold)
                page = list_entry(list->prev, struct page, lru);
            else
                page = list_entry(list->next, struct page, lru);
    
            list_del(&page->lru);
            pcp->count--;
        } else {
            if (unlikely(gfp_flags & __GFP_NOFAIL)) {
                /*
                 * __GFP_NOFAIL is not to be used in new code.
                 *
                 * All __GFP_NOFAIL callers should be fixed so that they
                 * properly detect and handle allocation failures.
                 *
                 * We most definitely don't want callers attempting to
                 * allocate greater than order-1 page units with
                 * __GFP_NOFAIL.
                 */
                WARN_ON_ONCE(order > 1);
            }
            spin_lock_irqsave(&zone->lock, flags);
            page = __rmqueue(zone, order, migratetype);
            spin_unlock(&zone->lock);
            if (!page)
                goto failed;
            __mod_zone_freepage_state(zone, -(1 << order),
                          get_freepage_migratetype(page));
        }
    
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
        if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
            !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
            set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
    
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
    
        VM_BUG_ON_PAGE(bad_range(zone, page), page);
        return page;
    
    failed:
        local_irq_restore(flags);
        return NULL;
    }
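
    For the order > 0 path, __rmqueue() ends up in __rmqueue_smallest(), which searches free_area[] upward from the requested order and, when it has to take a larger block, splits it back down. Below is a simplified sketch of that split loop, modeled on expand() in a v4.0-era mm/page_alloc.c (debug guard-page handling is omitted):

    /* Split a 2^high block down to 2^low, returning the unused halves
     * ("buddies") to the free lists of the intermediate orders. */
    static inline void expand(struct zone *zone, struct page *page,
            int low, int high, struct free_area *area,
            int migratetype)
    {
        unsigned long size = 1 << high;

        while (high > low) {
            area--;        /* drop to the next lower order's free_area */
            high--;
            size >>= 1;
            /* the upper half becomes a free block of order 'high' */
            list_add(&page[size].lru, &area->free_list[migratetype]);
            area->nr_free++;
            set_page_order(&page[size], high);
        }
    }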

    3. Freeing Pages

    __free_page--------------------------frees a single page
    free_page-->free_pages---------------takes a virtual address instead of a struct page
        __free_pages---------------------frees 2^order pages
            free_hot_cold_page-----------order-0 pages go back to the per-CPU lists
            __free_pages_ok--------------higher orders go back to the buddy free lists
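
    A minimal sketch of the alloc/free pairing in kernel code (the function demo_alloc_free() is made up for illustration; the API calls themselves are the standard ones):

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/printk.h>

    /* Allocate 2^2 = 4 contiguous, zeroed pages, then release them. */
    static int demo_alloc_free(void)
    {
        unsigned int order = 2;
        struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);

        if (!page)
            return -ENOMEM;

        /* page_address() works here because GFP_KERNEL allocates lowmem */
        pr_info("got %u pages at %p\n", 1U << order, page_address(page));

        __free_pages(page, order);
        return 0;
    }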

    4. Buddy-System-Related Nodes

    4.1 /proc/pagetypeinfo

    /proc/pagetypeinfo shows, for each zone, the number of free blocks at every order broken down by migrate type, plus the total count of pageblocks per migrate type:

    Page block order: 10
    Pages per block:  1024
    
    Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10 
    Node    0, zone   Normal, type    Unmovable    243    105     26      7      2      0      1      0      0      0      0 
    Node    0, zone   Normal, type  Reclaimable      1      1      0      2      0      0      0      1      1      1      0 
    Node    0, zone   Normal, type      Movable      4      2      3      4      4      2      3      3      2      2    156 
    Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1 
    Node    0, zone   Normal, type          CMA      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type    Unmovable      1      1      1      0      1      0      0      1      1      1      0 
    Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type      Movable      1      0      1      0      1      0      1      1      1      0     63 
    Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1 
    Node    0, zone  HighMem, type          CMA      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
    
    Number of blocks type     Unmovable  Reclaimable      Movable      Reserve          CMA      Isolate 
    Node 0, zone   Normal            6           19          164            1            0            0 
    Node 0, zone  HighMem            1            0           64            1            0            0 
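
    Reading the table as a worked example (assuming 4 KB pages, so one order-10 block is 1024 pages = 4 MB): the Normal zone's Movable type holds 156 free order-10 blocks, roughly 156 x 4 MB = 624 MB of free movable memory, whereas the Unmovable free memory is scattered across low orders (243 order-0 pages, 105 order-1 blocks, ...) with nothing above order 6; the file gives a direct view of fragmentation per migrate type.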
    Original article: https://www.cnblogs.com/arnoldlu/p/8250734.html