  • Building the infrastructure for borrowing memory (the fallback zonelists)

    Linux manages memory through a hierarchy of abstractions, each described by its own data structure: memory is first organized as a set of nodes, each node is managed as a set of zones, and each zone is managed as a set of pages.
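
    As a rough illustration (a sketch added here, not taken from the kernel source), the hierarchy can be walked top-down with the NODE_DATA() helper and the node_zones[]/nr_zones fields quoted below; dump_node_zone_hierarchy is a made-up name for the example:

    /*
     * Sketch: walk every online node and print each of its zones,
     * using the pg_data_t / struct zone fields shown in this article.
     */
    void dump_node_zone_hierarchy(void)
    {
        int nid, i;

        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);

            for (i = 0; i < pgdat->nr_zones; i++) {
                struct zone *zone = &pgdat->node_zones[i];

                printk(KERN_INFO "node %d zone %s: %lu present pages\n",
                       nid, zone->name, zone->present_pages);
            }
        }
    }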

    The pglist_data structure describes a node:

    typedef struct pglist_data {
        struct zone node_zones[MAX_NR_ZONES];
        struct zonelist node_zonelists[MAX_ZONELISTS];
        int nr_zones;
    #ifdef CONFIG_FLAT_NODE_MEM_MAP    /* means !SPARSEMEM */
        struct page *node_mem_map;
    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
        struct page_cgroup *node_page_cgroup;
    #endif
    #endif
        struct bootmem_data *bdata;
    #ifdef CONFIG_MEMORY_HOTPLUG
        /*
         * Must be held any time you expect node_start_pfn, node_present_pages
         * or node_spanned_pages stay constant.  Holding this will also
         * guarantee that any pfn_valid() stays that way.
         *
         * Nests above zone->lock and zone->size_seqlock.
         */
        spinlock_t node_size_lock;
    #endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        struct task_struct *kswapd;
        int kswapd_max_order;
    } pg_data_t;
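
    For reference, NODE_DATA(nid) is what maps a node id to one of these structures. Roughly (the exact definition varies by architecture and configuration, so treat this as a sketch):

    #ifndef CONFIG_NEED_MULTIPLE_NODES
    /* UMA kernels: a single statically allocated node descriptor. */
    extern struct pglist_data contig_page_data;
    #define NODE_DATA(nid)        (&contig_page_data)
    #else
    /* Generic NUMA kernels: an array of per-node pointers. */
    extern struct pglist_data *node_data[];
    #define NODE_DATA(nid)        (node_data[nid])
    #endif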

    The struct zone structure describes a zone:

    struct zone{
        /* Fields commonly accessed by the page allocator */
        unsigned long        pages_min, pages_low, pages_high;
        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
         * GB of ram we must reserve some of the lower zone memory (otherwise we risk
         * to run OOM on the lower zones despite there's tons of freeable ram
         * on the higher zones). This array is recalculated at runtime if the
         * sysctl_lowmem_reserve_ratio sysctl changes.
         */
        unsigned long        lowmem_reserve[MAX_NR_ZONES];
    
    #ifdef CONFIG_NUMA
        int node;
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
        unsigned long        min_unmapped_pages;
        unsigned long        min_slab_pages;
        struct per_cpu_pageset    *pageset[NR_CPUS];
    #else
        struct per_cpu_pageset    pageset[NR_CPUS];
    #endif
        /*
         * free areas of different sizes
         */
        spinlock_t        lock;
    #ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t        span_seqlock;
    #endif
        struct free_area    free_area[MAX_ORDER];
    
    #ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long        *pageblock_flags;
    #endif /* CONFIG_SPARSEMEM */
    
    
        ZONE_PADDING(_pad1_)
    
        /* Fields commonly accessed by the page reclaim scanner */
        spinlock_t        lru_lock;    
        struct {
            struct list_head list;
            unsigned long nr_scan;
        } lru[NR_LRU_LISTS];
    
        struct zone_reclaim_stat reclaim_stat;
    
        unsigned long        pages_scanned;       /* since last reclaim */
        unsigned long        flags;           /* zone flags, see below */
    
        /* Zone statistics */
        atomic_long_t        vm_stat[NR_VM_ZONE_STAT_ITEMS];
    
        /*
         * prev_priority holds the scanning priority for this zone.  It is
         * defined as the scanning priority at which we achieved our reclaim
         * target at the previous try_to_free_pages() or balance_pgdat()
         * invocation.
         *
         * We use prev_priority as a measure of how much stress page reclaim is
         * under - it drives the swappiness decision: whether to unmap mapped
         * pages.
         *
         * Access to both this field is quite racy even on uniprocessor.  But
         * it is expected to average out OK.
         */
        int prev_priority;
    
        /*
         * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
         * this zone's LRU.  Maintained by the pageout code.
         */
        unsigned int inactive_ratio;
    
    
        ZONE_PADDING(_pad2_)
        /* Rarely used or read-mostly fields */
    
        /*
         * wait_table        -- the array holding the hash table
         * wait_table_hash_nr_entries    -- the size of the hash table array
         * wait_table_bits    -- wait_table_size == (1 << wait_table_bits)
         *
         * The purpose of all these is to keep track of the people
         * waiting for a page to become available and make them
         * runnable again when possible. The trouble is that this
         * consumes a lot of space, especially when so few things
         * wait on pages at a given time. So instead of using
         * per-page waitqueues, we use a waitqueue hash table.
         *
         * The bucket discipline is to sleep on the same queue when
         * colliding and wake all in that wait queue when removing.
         * When something wakes, it must check to be sure its page is
         * truly available, a la thundering herd. The cost of a
         * collision is great, but given the expected load of the
         * table, they should be so rare as to be outweighed by the
         * benefits from the saved space.
         *
         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
         * primary users of these fields, and in mm/page_alloc.c
         * free_area_init_core() performs the initialization of them.
         */
        wait_queue_head_t    * wait_table;
        unsigned long        wait_table_hash_nr_entries;
        unsigned long        wait_table_bits;
    
        /*
         * Discontig memory support fields.
         */
        struct pglist_data    *zone_pgdat;
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long        zone_start_pfn;
    
        /*
         * zone_start_pfn, spanned_pages and present_pages are all
         * protected by span_seqlock.  It is a seqlock because it has
         * to be read outside of zone->lock, and it is done in the main
         * allocator path.  But, it is written quite infrequently.
         *
         * The lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         */
        unsigned long        spanned_pages;    /* total size, including holes */
        unsigned long        present_pages;    /* amount of memory (excluding holes) */
    
        /*
         * rarely used fields:
         */
        const char        *name;
    };

    1. Building the structures used for borrowing memory

    The key structure here is struct zonelist, which backs the node_zonelists[] array in pg_data_t.

    /*
     * One allocation request operates on a zonelist. A zonelist
     * is a list of zones, the first one is the 'goal' of the
     * allocation, the other zones are fallback zones, in decreasing
     * priority.
     *
     * If zlcache_ptr is not NULL, then it is just the address of zlcache,
     * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
     * *
     * To speed the reading of the zonelist, the zonerefs contain the zone index
     * of the entry being read. Helper functions to access information given
     * a struct zoneref are
     *
     * zonelist_zone()    - Return the struct zone * for an entry in _zonerefs
     * zonelist_zone_idx()    - Return the index of the zone for an entry
     * zonelist_node_idx()    - Return the index of the node for an entry
     */
    struct zonelist {
        struct zonelist_cache *zlcache_ptr;             // NULL or &zlcache
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
    #ifdef CONFIG_NUMA
        struct zonelist_cache zlcache;                 // optional ...
    #endif
    };

    A related structure is struct zoneref:

    struct zoneref {
        struct zone *zone;    /* Pointer to actual zone */
        int zone_idx;        /* zone_idx(zoneref->zone) */
    };

    And struct zonelist_cache:

    #ifdef CONFIG_NUMA
    
    /*
     * The NUMA zonelists are doubled because we need zonelists that restrict the
     * allocations to a single node for GFP_THISNODE.
     *
     * [0]    : Zonelist with fallback
     * [1]    : No fallback (GFP_THISNODE)
     */
    #define MAX_ZONELISTS 2
    
    struct zonelist_cache {
        unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];        /* zone->nid */
        DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);    /* zone full? */
        unsigned long last_full_zap;        /* when last zap'd (jiffies) */
    };
    #else
    #define MAX_ZONELISTS 1
    struct zonelist_cache;
    #endif

    In pg_data_t->node_zonelists[MAX_ZONELISTS], the value of MAX_ZONELISTS depends on whether CONFIG_NUMA is configured.

    When NUMA is configured there are multiple memory nodes: node_zonelists[0] holds the fallback list, while node_zonelists[1] holds the zone list of the node itself (no fallback, used for GFP_THISNODE).
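
    The allocator chooses between the two lists based on the GFP flags. A simplified sketch of the selection helpers, modeled on node_zonelist()/gfp_zonelist() from include/linux/gfp.h of this kernel generation:

    /* Sketch: index 1 (the no-fallback list) is used only for NUMA
     * allocations marked __GFP_THISNODE; everything else uses index 0. */
    static inline int gfp_zonelist(gfp_t flags)
    {
        if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
            return 1;
        return 0;
    }

    static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
    {
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
    }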

    1. set_zonelist_order

    There are two strategies for borrowing memory. Node order: allocate from the local node first and fall back to other nodes only when the local node has no usable memory; this favors speed.

    Zone order: allocate from the cheaper zones first; when the local node's zone is exhausted, try the same zone type on other nodes, and only when all of those fail move on to the more precious (lower) zones; this favors reliability. A concrete illustration follows the code comment below.

    /*
     *  zonelist_order:
     *  0 = automatic detection of better ordering.
     *  1 = order by ([node] distance, -zonetype)
     *  2 = order by (-zonetype, [node] distance)
     *
     *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
     *  the same zonelist. So only NUMA can configure this param.
     */
    #define ZONELIST_ORDER_DEFAULT  0
    #define ZONELIST_ORDER_NODE     1
    #define ZONELIST_ORDER_ZONE     2
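
    As a concrete (hypothetical) example, consider a two-node machine where each node has a DMA and a NORMAL zone. Node 0's fallback list would be ordered roughly as follows under the two policies:

    /*
     * ZONELIST_ORDER_NODE: exhaust the local node before going remote.
     *   Node0/NORMAL -> Node0/DMA -> Node1/NORMAL -> Node1/DMA
     *
     * ZONELIST_ORDER_ZONE: exhaust a zone type on all nodes before
     * falling back to a more precious (lower) zone type.
     *   Node0/NORMAL -> Node1/NORMAL -> Node0/DMA -> Node1/DMA
     */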

    For UMA memory there is only one node, so ZONELIST_ORDER_ZONE is the only possible choice.

    static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; is a global variable defined in mm/page_alloc.c.

    static void set_zonelist_order(void)
    {
        current_zonelist_order = ZONELIST_ORDER_ZONE;
    }

    For NUMA memory there are multiple nodes, and the fallback order is chosen according to how memory is distributed across the different zones.

    static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;

    static void set_zonelist_order(void)
    {
        if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
            current_zonelist_order = default_zonelist_order();
        else
            current_zonelist_order = user_zonelist_order;
    }

    If there is no DMA zone, or the DMA zone holds a relatively large share of memory, node order (ZONELIST_ORDER_NODE) is chosen; otherwise zone order (ZONELIST_ORDER_ZONE) is chosen.
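
    A simplified sketch of that decision (the real default_zonelist_order() in mm/page_alloc.c weighs the per-node DMA/DMA32 sizes in more detail; this only captures the idea described above):

    /* Simplified sketch, not the literal kernel code. */
    static int default_zonelist_order(void)
    {
        unsigned long low_kmem_size = 0;    /* pages in DMA/DMA32 zones     */
        unsigned long total_size = 0;       /* pages in all populated zones */
        struct zone *z;
        int nid;

        for_each_online_node(nid) {
            for (z = NODE_DATA(nid)->node_zones;
                 z < NODE_DATA(nid)->node_zones + MAX_NR_ZONES; z++) {
                if (!populated_zone(z))
                    continue;
                if (zone_idx(z) < ZONE_NORMAL)
                    low_kmem_size += z->present_pages;
                total_size += z->present_pages;
            }
        }

        /* No low (DMA) zone at all, or plenty of low memory: node order. */
        if (!low_kmem_size || low_kmem_size > total_size / 2)
            return ZONELIST_ORDER_NODE;

        /* Low memory is scarce: protect it by using zone order. */
        return ZONELIST_ORDER_ZONE;
    }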

    2. __build_all_zonelists

    The main work of building the fallback lists is done in __build_all_zonelists.

    static int __build_all_zonelists(void *dummy)
    {
        int nid;
    
        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);
    
            build_zonelists(pgdat);
            build_zonelist_cache(pgdat);
        }
        return 0;
    }
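
    For context, this worker is not called directly. It is driven by build_all_zonelists(), roughly like this (trimmed sketch; at boot it is called directly, while hotplug rebuilds go through stop_machine() so that no CPU uses a zonelist while it is being rewritten):

    void build_all_zonelists(void)
    {
        set_zonelist_order();

        if (system_state == SYSTEM_BOOTING) {
            __build_all_zonelists(NULL);
            mminit_verify_zonelist();
            cpuset_init_current_mems_allowed();
        } else {
            /* Stop all CPUs: nobody may walk a zonelist while it changes. */
            stop_machine(__build_all_zonelists, NULL, NULL);
        }
        /* ... remaining bookkeeping (vm_total_pages etc.) omitted ... */
    }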

    2.1 build_zonelists initializes the fallback list pg_data_t->node_zonelists[0] and, at the same time, the node-local zone list pg_data_t->node_zonelists[1].

    This mainly amounts to initializing the zonelist->_zonerefs array; the definition of struct zoneref is repeated here:

    struct zoneref {
        struct zone *zone;    /* Pointer to actual zone */
        int zone_idx;        /* zone_idx(zoneref->zone) */
    };

    The function that ultimately fills in each struct zoneref is zoneref_set_zone:

    static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
    {
        zoneref->zone = zone;
        zoneref->zone_idx = zone_idx(zone);
    }
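
    For a single node, the _zonerefs entries are filled from the highest populated zone downwards. A sketch modeled on build_zonelists_node() in mm/page_alloc.c:

    /* Sketch: append this node's populated zones, highest zone type first,
     * to the zonelist starting at slot nr_zones; return the new slot count. */
    static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                    int nr_zones, enum zone_type zone_type)
    {
        struct zone *zone;

        do {
            zone_type--;
            zone = pgdat->node_zones + zone_type;
            if (populated_zone(zone))
                zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]);
        } while (zone_type);

        return nr_zones;
    }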

    2.2 build_zonelist_cache initializes node_zonelists[0].zlcache, i.e. the zonelist_cache structure.

    The definition of struct zonelist_cache and the build_zonelist_cache function are shown below. The structure exists mainly to speed up scanning of the zonelist.

    struct zonelist_cache {
        unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];        /* zone->nid */
        DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);    /* zone full? */
        unsigned long last_full_zap;        /* when last zap'd (jiffies) */
    };
    
    /* Construct the zonelist performance cache - see further mmzone.h */
    static void build_zonelist_cache(pg_data_t *pgdat)
    {
        struct zonelist *zonelist;
        struct zonelist_cache *zlc;
        struct zoneref *z;
    
        zonelist = &pgdat->node_zonelists[0];
        zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
        bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
        for (z = zonelist->_zonerefs; z->zone; z++)
            zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
    }
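
    During allocation, get_page_from_freelist() consults this cache to skip zones that were recently found full. A simplified sketch of that check, modeled on zlc_zone_worth_trying() in mm/page_alloc.c:

    /* Sketch: return 1 if zoneref z is worth scanning, 0 if the cache says
     * the zone is on a disallowed node or was recently marked full. */
    static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
                     nodemask_t *allowednodes)
    {
        struct zonelist_cache *zlc = zonelist->zlcache_ptr;
        int i, n;

        if (!zlc)
            return 1;                    /* no cache (e.g. UMA build) */

        i = z - zonelist->_zonerefs;     /* index of this zonelist entry */
        n = zlc->z_to_n[i];              /* node this zone belongs to    */

        return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
    }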

    3. mminit_verify_zonelist()

    This function prints the fallback list of every node in the system. Here we look at one key piece of it: how for_each_zone_zonelist(zone, z, zonelist, zoneid) walks a zonelist.

    3.1 The body of the for loop

    /*
    Iterate the zonelist
    struct zone *zone;          @zone - the current zone in the iteration
    struct zoneref *z;          @z - the current entry within zonelist->_zonerefs
    struct zonelist *zonelist;  fixed pointer to the pg_data_t fallback list
    int zoneid;                 fixed upper bound used for comparison
     */
    for_each_zone_zonelist(zone, z, zonelist, zoneid) 
        -->for_each_zone_zonelist_nodemask(zone, z, zonelist, zoneid, NULL)
            -->for (  z = first_zones_zonelist(zonelist, zoneid, NULL, &zone);    
                  zone;                            
                  z = next_zones_zonelist(++z, zoneid, NULL, &zone)        )    

    3.2 Loop initialization

    /*
    struct zonelist *zonelist;  fixed pointer to the pg_data_t fallback list
    struct zoneref *z;          @z - the current entry within zonelist->_zonerefs
    struct zone *zone;          @zone - the current zone in the iteration
    int zoneid;                 fixed upper bound used for comparison
    */
    z = first_zones_zonelist(zonelist, zoneid, NULL, &zone)
        -->next_zones_zonelist(zonelist->_zonerefs, zoneid, NULL, &zone);  // returns a pointer to a struct zoneref
        

    3.3 Advancing the loop

    z = next_zones_zonelist(++z, zoneid, NULL, &zone)
        -->while (zonelist_zone_idx(z) > zoneid)    z++;
        -->*zone = zonelist_zone(z);
        -->return z;
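
    Putting it together, a simplified sketch of how mminit_verify_zonelist() walks and prints every node's fallback list (trimmed; the real function also honors mminit_loglevel and iterates both zonelists):

    void mminit_verify_zonelist(void)
    {
        int nid;

        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);
            struct zonelist *zonelist = &pgdat->node_zonelists[0];
            struct zone *zone;
            struct zoneref *z;
            int zoneid;

            for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
                printk(KERN_DEBUG "zonelist for node %d, zone %s:",
                       nid, pgdat->node_zones[zoneid].name);

                /* Visits every entry whose zone index is <= zoneid. */
                for_each_zone_zonelist(zone, z, zonelist, zoneid)
                    printk(KERN_CONT " %d:%s", zone_to_nid(zone), zone->name);
                printk(KERN_CONT "\n");
            }
        }
    }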
  • Original post: https://www.cnblogs.com/yangjiguang/p/9498939.html