Linux Memory Management (4): Allocating Physical Pages

    Series: Linux Memory Management

    Keywords: allocation mask (gfp_mask), buddy system, watermark, merging of free buddy blocks

    Linux manages memory in units of pages, and physical pages are managed through the buddy system.

    As the Linux memory-management framework diagram shows, the page allocator is the foundation on which all the other assorted memory operations are built.

    That is why, after covering "Linux Memory Management (1): Physical Memory Initialization", "Linux Memory Management (2): The Page Table Mapping Process" and "Linux Memory Management (3): Kernel Memory Layout", the next thing to understand is the page allocator.

    1. Important Data Structures

    1.1 The Page Allocation Mask

    alloc_pages is the commonly used kernel interface for allocating physical pages. It takes two parameters, one of which is the allocation mask.

    include/linux/gfp.h holds the GFP (Get Free Page) allocation masks. They fall into two classes: masks beginning with __GFP_, and masks beginning with GFP_, which are generally combinations of the __GFP_ ones.

    The __GFP_ masks themselves split into two groups: zone modifiers and action modifiers.

    Zone modifiers occupy the low 4 bits of the mask and specify which zone pages should be allocated from.

    Action modifiers define attributes of the allocation (whether it may sleep, perform I/O, and so on).

    /* Plain integer GFP bitmasks. Do not use this directly. */
    #define ___GFP_DMA        0x01u
    #define ___GFP_HIGHMEM        0x02u
    #define ___GFP_DMA32        0x04u
    #define ___GFP_MOVABLE        0x08u
    #define ___GFP_WAIT        0x10u
    #define ___GFP_HIGH        0x20u
    #define ___GFP_IO        0x40u
    #define ___GFP_FS        0x80u
    #define ___GFP_COLD        0x100u
    #define ___GFP_NOWARN        0x200u
    #define ___GFP_REPEAT        0x400u
    #define ___GFP_NOFAIL        0x800u
    #define ___GFP_NORETRY        0x1000u
    #define ___GFP_MEMALLOC        0x2000u
    #define ___GFP_COMP        0x4000u
    #define ___GFP_ZERO        0x8000u
    #define ___GFP_NOMEMALLOC    0x10000u
    #define ___GFP_HARDWALL        0x20000u
    #define ___GFP_THISNODE        0x40000u
    #define ___GFP_RECLAIMABLE    0x80000u
    #define ___GFP_NOTRACK        0x200000u
    #define ___GFP_NO_KSWAPD    0x400000u
    #define ___GFP_OTHER_NODE    0x800000u
    #define ___GFP_WRITE        0x1000000u
    /* If the above are modified, __GFP_BITS_SHIFT may need updating */
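
    To make the zone-modifier mapping concrete, here is a simplified, semantically equivalent sketch of gfp_zone() (the real implementation in include/linux/gfp.h packs this mapping into a GFP_ZONE_TABLE bit table; the function below is an illustration written for this article, not the kernel's code):

    /* Simplified illustration of how the low gfp bits select a zone.
     * The real gfp_zone() uses a packed GFP_ZONE_TABLE lookup. */
    static inline enum zone_type gfp_zone_illustrative(gfp_t flags)
    {
        if (flags & __GFP_DMA)
            return ZONE_DMA;
        if (flags & __GFP_DMA32)
            return ZONE_DMA32;
        if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
                (__GFP_HIGHMEM | __GFP_MOVABLE))
            return ZONE_MOVABLE;
        if (flags & __GFP_HIGHMEM)
            return ZONE_HIGHMEM;
        return ZONE_NORMAL;    /* no zone modifier set */
    }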

    In practice, callers mostly use the GFP_-prefixed combinations:

    /* This equals 0, but use constants in case they ever change */
    #define GFP_NOWAIT    (GFP_ATOMIC & ~__GFP_HIGH)
    /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
    #define GFP_ATOMIC    (__GFP_HIGH)
    #define GFP_NOIO    (__GFP_WAIT)
    #define GFP_NOFS    (__GFP_WAIT | __GFP_IO)
    #define GFP_KERNEL    (__GFP_WAIT | __GFP_IO | __GFP_FS)
    #define GFP_TEMPORARY    (__GFP_WAIT | __GFP_IO | __GFP_FS | \
                 __GFP_RECLAIMABLE)
    #define GFP_USER    (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
    #define GFP_HIGHUSER    (GFP_USER | __GFP_HIGHMEM)
    #define GFP_HIGHUSER_MOVABLE    (GFP_HIGHUSER | __GFP_MOVABLE)
    #define GFP_IOFS    (__GFP_IO | __GFP_FS)
    #define GFP_TRANSHUGE    (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
                 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
                 __GFP_NO_KSWAPD)
    
    /*
     * GFP_THISNODE does not perform any reclaim, you most likely want to
     * use __GFP_THISNODE to allocate from a given node without fallback!
     */
    #ifdef CONFIG_NUMA
    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
    #else
    #define GFP_THISNODE    ((__force gfp_t)0)
    #endif
    
    /* This mask makes up all the page movable related flags */
    #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
    
    /* Control page allocator reclaim behavior */
    #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
                __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
    
    /* Control slab gfp mask during early boot */
    #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
    
    /* Control allocation constraints */
    #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
    
    /* Do not use these with a slab allocator */
    #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
    
    /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
       platforms, used as appropriate on others */
    
    #define GFP_DMA        __GFP_DMA
    
    /* 4GB DMA on some platforms */
    #define GFP_DMA32    __GFP_DMA32
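
    As a quick illustration of how callers pick among these combinations (the two helper functions below are made up for this example; only the GFP constants and alloc_pages() are real):

    /* Hypothetical helpers showing typical gfp choices. */
    static struct page *grab_page_process_context(void)
    {
        /* may sleep, may start I/O and filesystem activity to reclaim */
        return alloc_pages(GFP_KERNEL, 0);
    }

    static struct page *grab_page_atomic_context(void)
    {
        /* interrupt/atomic context: must not sleep (__GFP_WAIT clear),
         * may dip into emergency reserves (__GFP_HIGH set) */
        return alloc_pages(GFP_ATOMIC, 0);
    }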

    2. Allocating Memory via the Buddy System

    alloc_page-------------------------------allocates a single page
    get_zeroed_page-->__get_free_pages
        alloc_pages--------------------------allocates 2^order pages
            alloc_pages_node-----------------adds a node id parameter
                __alloc_pages
                    __alloc_pages_nodemask---adds a nodemask parameter
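
    These wrappers are thin. Roughly, in a v4.0-era include/linux/gfp.h they nest as follows (simplified sketch; exact signatures vary between kernel versions):

    static inline struct page *
    __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
    {
        return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
    }

    static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                            unsigned int order)
    {
        /* a negative nid means "the current node" */
        if (nid < 0)
            nid = numa_node_id();

        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }

    #define alloc_pages(gfp_mask, order) \
            alloc_pages_node(numa_node_id(), gfp_mask, order)
    #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)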

     __alloc_pages_nodemask is the 'heart' of the zoned buddy allocator.

    This kernel comment makes two points: __alloc_pages_nodemask is central, and the buddy page allocator here works on a per-zone basis.

    struct alloc_context is the data structure the buddy allocation functions use to carry the allocation parameters.

    struct alloc_context {
        struct zonelist *zonelist;    /* zones to try, in fallback order */
        nodemask_t *nodemask;         /* allowed NUMA nodes, NULL means cpuset default */
        struct zone *preferred_zone;  /* first usable zone found in the zonelist */
        int classzone_idx;            /* index of preferred_zone, for watermark checks */
        int migratetype;              /* migrate type derived from gfp_mask */
        enum zone_type high_zoneidx;  /* highest zone the gfp_mask allows */
    };

    The zonelist here has already been obtained via node_zonelist(nid, gfp_mask): zonelist = NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags)
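
    For reference, node_zonelist() and gfp_zonelist() are one-liners (a sketch based on a v4.0-era include/linux/gfp.h):

    static inline int gfp_zonelist(gfp_t flags)
    {
        if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
            return 1;    /* the node-local (no-fallback) zonelist */

        return 0;        /* the normal zonelist, with fallback to other nodes */
    }

    static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
    {
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
    }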

    struct page *
    __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                struct zonelist *zonelist, nodemask_t *nodemask)
    {
        struct zoneref *preferred_zoneref;
        struct page *page = NULL;
        unsigned int cpuset_mems_cookie;
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
        gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
        .high_zoneidx = gfp_zone(gfp_mask),----------------------------------gfp_zone() maps the low 4 bits of gfp_mask to a zone_type: ZONE_NORMAL? ZONE_HIGHMEM?
            .nodemask = nodemask,
        .migratetype = gfpflags_to_migratetype(gfp_mask),--------------------derive the migratetype from gfp_mask: MIGRATE_RECLAIMABLE? MIGRATE_MOVABLE?
        };
    
        gfp_mask &= gfp_allowed_mask;
    
        lockdep_trace_alloc(gfp_mask);
    
        might_sleep_if(gfp_mask & __GFP_WAIT);
    
        if (should_fail_alloc_page(gfp_mask, order))
            return NULL;
    
        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
            return NULL;
    
        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
            alloc_flags |= ALLOC_CMA;
    
    retry_cpuset:
        cpuset_mems_cookie = read_mems_allowed_begin();
    
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                    ac.nodemask ? : &cpuset_current_mems_allowed,
                    &ac.preferred_zone);
        if (!ac.preferred_zone)
            goto out;
        ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
    
        /* First allocation attempt */
        alloc_mask = gfp_mask|__GFP_HARDWALL;
        page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);---------first attempt to allocate physical pages (the fast path)
        if (unlikely(!page)) {
            /*
             * Runtime PM, block IO and its error handling path
             * can deadlock because I/O on the device might not
             * complete.
             */
            alloc_mask = memalloc_noio_flags(gfp_mask);
    
            page = __alloc_pages_slowpath(alloc_mask, order, &ac);-----------------if the fast path fails, the slow path handles the many special cases here.
        }
    
        if (kmemcheck_enabled && page)
            kmemcheck_pagealloc_alloc(page, order, gfp_mask);
    
        trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
    
    out:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
            goto retry_cpuset;--------------------------------------------------retry the page allocation
    
        return page;
    }

    get_page_from_freelist() iterates over the zones in ac->zonelist, looks for a zone that satisfies all the constraints, allocates pages from it, and returns them.

    static struct page *
    get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                            const struct alloc_context *ac)
    {
        struct zonelist *zonelist = ac->zonelist;
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
        nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
        int zlc_active = 0;        /* set if using zonelist_cache */
        int did_zlc_setup = 0;        /* just call zlc_setup() one time */
        bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                    (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
    
    zonelist_scan:-------------------------------------------------------------------start scanning ac->zonelist.
        zonelist_rescan = false;
    
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
         */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,--------iterate over the zones in the zonelist at or below ac->high_zoneidx; each iteration yields one zone.
                                    ac->nodemask) {
    
    ...-----------------------------------------------------------------------------a series of checks; a zone that fails any of them is skipped (continue to the next zone); a zone that passes moves on to the watermark check.
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];-----------------the alloc_flags here include ALLOC_WMARK_LOW
        if (!zone_watermark_ok(zone, order, mark,-------------------------------so this checks the zone against its low watermark; if it is not met, fall through to further checks or try zone_reclaim.
                           ac->classzone_idx, alloc_flags)) {
                int ret;
    
                /* Checked here to keep the fast path fast */
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (alloc_flags & ALLOC_NO_WATERMARKS)
                    goto try_this_zone;
    ...
            ret = zone_reclaim(zone, gfp_mask, order);-------------------------reclaim some pages via zone_reclaim()
                switch (ret) {
    ...
            default: /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,-----------------------recheck whether the watermark is now satisfied
                        ac->classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

    try_this_zone:---------------------------------------------------------------once the watermark and all other checks pass, pages can be allocated from this zone.
        page = buffered_rmqueue(ac->preferred_zone, zone, order,-----------------take pages from this zone's free lists
                        gfp_mask, ac->migratetype);
        if (page) {
            if (prep_new_page(page, order, gfp_mask, alloc_flags))
                goto try_this_zone;
            return page;
        }
    this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
            zlc_mark_zone_full(zonelist, z);
    }

        /*
         * The first pass makes sure allocations are spread fairly within the
         * local node. However, the local node might have free pages left
         * after the fairness batches are exhausted, and remote zones haven't
         * even been considered yet. Try once more without fairness, and
         * include remote zones now, before entering the slowpath and waking
         * kswapd: prefer spilling to a remote zone over swapping locally.
         */
        if (alloc_flags & ALLOC_FAIR) {
            alloc_flags &= ~ALLOC_FAIR;
            if (nr_fair_skipped) {
                zonelist_rescan = true;
                reset_alloc_batches(ac->preferred_zone);
            }
            if (nr_online_nodes > 1)
                zonelist_rescan = true;
        }

        if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
            /* Disable zlc cache for second zonelist scan */
            zlc_active = 0;
            zonelist_rescan = true;
        }

        if (zonelist_rescan)
            goto zonelist_scan;

        return NULL;
    }

    How the watermark values themselves are calculated is covered in detail in the watermark article.

    Next, look at __zone_watermark_ok(), the function that decides whether a zone's free pages satisfy the watermark selected by alloc_flags.

    Parameters: z is the zone being checked; order is the order of the requested allocation; mark is the watermark value to test against; classzone_idx is the zone index; alloc_flags are the allocation flags; free_pages is the current number of free pages.

    static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx, int alloc_flags,
                long free_pages)
    {
        /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
        long free_cma = 0;
    
        free_pages -= (1 << order) - 1;---------------------------------------------pages remaining after subtracting the request; why the -1??
        if (alloc_flags & ALLOC_HIGH)
            min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
            min -= min / 4;
    ...
        if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])--------free pages (excluding CMA) must exceed min plus the zone's lowmem_reserve
            return false;
        for (o = 0; o < order; o++) {-----------------------------------------------walk every buddy order below the requested one, checking at each step whether the remaining free pages still meet the watermark
            /* At the next order, this order's pages become unavailable */
            free_pages -= z->free_area[o].nr_free << o;-----------------------------subtract this order's free pages from the total
    
            /* Require fewer higher order pages to be free */
            min >>= 1;--------------------------------------------------------------halve the watermark
    
            if (free_pages <= min)--------------------------------------------------then compare against the watermark again
                return false;
        }
        return true;----------------------------------------------------------------all checks passed, return true
    }

    The purpose of the loop can be summed up as follows:
    iterate order by order, checking whether enough large (i.e. higher-order) free blocks remain in the zone.
    Each iteration first subtracts the current order's free pages from the total, because the question is whether enough larger blocks are left.
    And since part of the free pages has been struck off, the bar for comparison should be lowered accordingly.
    How much it is lowered is exactly the right shift of min described above.
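
    To see the loop in action, here is a standalone userspace sketch of the same logic with made-up numbers (the per-order free counts and the watermark are hypothetical, and lowmem_reserve is omitted for brevity):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11

    static bool watermark_ok(unsigned int order, long mark,
                             long free_pages, const long nr_free[MAX_ORDER])
    {
        long min = mark;
        unsigned int o;

        free_pages -= (1 << order) - 1;      /* pages the request would consume */
        if (free_pages <= min)               /* lowmem_reserve omitted here */
            return false;

        for (o = 0; o < order; o++) {
            free_pages -= nr_free[o] << o;   /* blocks too small for this request */
            min >>= 1;                       /* relax the bar correspondingly */
            if (free_pages <= min)
                return false;
        }
        return true;
    }

    int main(void)
    {
        /* hypothetical zone: free block counts for orders 0..10 */
        const long nr_free[MAX_ORDER] = { 200, 100, 50, 20, 8, 4, 2, 1, 0, 0, 0 };
        long free_pages = 0;
        int o;

        for (o = 0; o < MAX_ORDER; o++)
            free_pages += nr_free[o] << o;

        printf("free_pages=%ld, order-3 request ok: %d\n",
               free_pages, watermark_ok(3, 128, free_pages, nr_free));
        return 0;
    }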

    Reference: an analysis of __zone_watermark_ok

    zone_reclaim() attempts some quick, zone-local page reclaim before the allocator gives up on this zone; several early checks make it bail out with ZONE_RECLAIM_FULL or ZONE_RECLAIM_NOSCAN instead of scanning:

    int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
    {
        int node_id;
        int ret;
    
        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
            return ZONE_RECLAIM_FULL;
    
        if (!zone_reclaimable(zone))
            return ZONE_RECLAIM_FULL;
    
        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
            return ZONE_RECLAIM_NOSCAN;
    
        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
            return ZONE_RECLAIM_NOSCAN;
    
        if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
            return ZONE_RECLAIM_NOSCAN;
    
        ret = __zone_reclaim(zone, gfp_mask, order);
        clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
    
        if (!ret)
            count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
    
        return ret;
    }

    buffered_rmqueue() takes pages off the zone's free lists: order-0 requests are served from the per-CPU page (pcp) lists, while higher orders go through __rmqueue() under the zone lock:

    /*
     * Allocate a page from the given zone. Use pcplists for order-0 allocations.
     */
    static inline
    struct page *buffered_rmqueue(struct zone *preferred_zone,
                struct zone *zone, unsigned int order,
                gfp_t gfp_flags, int migratetype)
    {
        unsigned long flags;
        struct page *page;
        bool cold = ((gfp_flags & __GFP_COLD) != 0);
    
        if (likely(order == 0)) {
            struct per_cpu_pages *pcp;
            struct list_head *list;
    
            local_irq_save(flags);
            pcp = &this_cpu_ptr(zone->pageset)->pcp;
            list = &pcp->lists[migratetype];
            if (list_empty(list)) {
                pcp->count += rmqueue_bulk(zone, 0,
                        pcp->batch, list,
                        migratetype, cold);
                if (unlikely(list_empty(list)))
                    goto failed;
            }
    
            if (cold)
                page = list_entry(list->prev, struct page, lru);
            else
                page = list_entry(list->next, struct page, lru);
    
            list_del(&page->lru);
            pcp->count--;
        } else {
            if (unlikely(gfp_flags & __GFP_NOFAIL)) {
                /*
                 * __GFP_NOFAIL is not to be used in new code.
                 *
                 * All __GFP_NOFAIL callers should be fixed so that they
                 * properly detect and handle allocation failures.
                 *
                 * We most definitely don't want callers attempting to
                 * allocate greater than order-1 page units with
                 * __GFP_NOFAIL.
                 */
                WARN_ON_ONCE(order > 1);
            }
            spin_lock_irqsave(&zone->lock, flags);
            page = __rmqueue(zone, order, migratetype);
            spin_unlock(&zone->lock);
            if (!page)
                goto failed;
            __mod_zone_freepage_state(zone, -(1 << order),
                          get_freepage_migratetype(page));
        }
    
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
        if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
            !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
            set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
    
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
    
        VM_BUG_ON_PAGE(bad_range(zone, page), page);
        return page;
    
    failed:
        local_irq_restore(flags);
        return NULL;
    }
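
    For the order > 0 path, __rmqueue() ends up in __rmqueue_smallest(), which searches free_area[] upward from the requested order and, when it has to take a larger block, splits it back down. Below is a simplified sketch of that split loop, modeled on expand() in a v4.0-era mm/page_alloc.c (debug guard-page handling is omitted):

    /* Split a 2^high block down to 2^low, returning the unused halves
     * ("buddies") to the free lists of the intermediate orders. */
    static inline void expand(struct zone *zone, struct page *page,
            int low, int high, struct free_area *area,
            int migratetype)
    {
        unsigned long size = 1 << high;

        while (high > low) {
            area--;        /* drop to the next lower order's free_area */
            high--;
            size >>= 1;
            /* the upper half becomes a free block of order 'high' */
            list_add(&page[size].lru, &area->free_list[migratetype]);
            area->nr_free++;
            set_page_order(&page[size], high);
        }
    }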

    3. Freeing Pages

    __free_page--------------------------frees a single page
    free_page-->free_pages---------------takes a virtual address instead of a struct page
        __free_pages---------------------frees 2^order pages
            free_hot_cold_page-----------order-0 pages go back to the per-CPU lists
            __free_pages_ok--------------higher orders go back to the buddy free lists
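
    A minimal sketch of the alloc/free pairing in kernel code (the function demo_alloc_free() is made up for illustration; the API calls themselves are the standard ones):

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/printk.h>

    /* Allocate 2^2 = 4 contiguous, zeroed pages, then release them. */
    static int demo_alloc_free(void)
    {
        unsigned int order = 2;
        struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);

        if (!page)
            return -ENOMEM;

        /* page_address() works here because GFP_KERNEL allocates lowmem */
        pr_info("got %u pages at %p\n", 1U << order, page_address(page));

        __free_pages(page, order);
        return 0;
    }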

    4. Buddy-System-Related Nodes

    4.1 /proc/pagetypeinfo

    /proc/pagetypeinfo shows, for each zone, the number of free blocks at every order broken down by migrate type, plus the total count of pageblocks per migrate type:

    Page block order: 10
    Pages per block:  1024
    
    Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10 
    Node    0, zone   Normal, type    Unmovable    243    105     26      7      2      0      1      0      0      0      0 
    Node    0, zone   Normal, type  Reclaimable      1      1      0      2      0      0      0      1      1      1      0 
    Node    0, zone   Normal, type      Movable      4      2      3      4      4      2      3      3      2      2    156 
    Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1 
    Node    0, zone   Normal, type          CMA      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type    Unmovable      1      1      1      0      1      0      0      1      1      1      0 
    Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type      Movable      1      0      1      0      1      0      1      1      1      0     63 
    Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1 
    Node    0, zone  HighMem, type          CMA      0      0      0      0      0      0      0      0      0      0      0 
    Node    0, zone  HighMem, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
    
    Number of blocks type     Unmovable  Reclaimable      Movable      Reserve          CMA      Isolate 
    Node 0, zone   Normal            6           19          164            1            0            0 
    Node 0, zone  HighMem            1            0           64            1            0            0 
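
    Reading the table as a worked example (assuming 4 KB pages, so one order-10 block is 1024 pages = 4 MB): the Normal zone's Movable type holds 156 free order-10 blocks, roughly 156 x 4 MB = 624 MB of free movable memory, whereas the Unmovable free memory is scattered across low orders (243 order-0 pages, 105 order-1 blocks, ...) with nothing above order 6; the file gives a direct view of fragmentation per migrate type.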
    Original article: https://www.cnblogs.com/arnoldlu/p/8250734.html