  • Linux-3.14.12 Memory Management Notes [Building the Memory Management Framework (5)]

    The previous articles analyzed how the memory management framework is built, but some of the content was not fully presented; this post adds those details as a supplement.

    As shown in the figure below, this is the hierarchy of the Linux physical memory management framework that we have seen before.

    [Figure: hierarchy of the Linux physical memory management framework]

    Let us now focus on what the members of each management structure are used for.

    【file:/include/linux/mmzone.h】
    typedef struct pglist_data {
        struct zone node_zones[MAX_NR_ZONES];
        struct zonelist node_zonelists[MAX_ZONELISTS];
        int nr_zones;
    #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
        struct page *node_mem_map;
    #ifdef CONFIG_MEMCG
        struct page_cgroup *node_page_cgroup;
    #endif
    #endif
    #ifndef CONFIG_NO_BOOTMEM
        struct bootmem_data *bdata;
    #endif
    #ifdef CONFIG_MEMORY_HOTPLUG
        /*
         * Must be held any time you expect node_start_pfn, node_present_pages
         * or node_spanned_pages stay constant. Holding this will also
         * guarantee that any pfn_valid() stays that way.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
    #endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                             range, including holes */
        int node_id;
        nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
        int kswapd_max_order;
        enum zone_type classzone_idx;
    #ifdef CONFIG_NUMA_BALANCING
        /* Lock serializing the migrate rate limiting window */
        spinlock_t numabalancing_migrate_lock;
     
        /* Rate limiting time interval */
        unsigned long numabalancing_migrate_next_window;
     
        /* Number of pages migrated during the rate limiting time interval */
        unsigned long numabalancing_migrate_nr_pages;
    #endif
    } pg_data_t;
    
    • struct zone node_zones[MAX_NR_ZONES];

    ——the zones contained in this pg_data_t (node);

    • struct zonelist node_zonelists[MAX_ZONELISTS];

    ——the zone fallback lists: for an allocation they give the order in which the zones of this node (and, on NUMA, of the other nodes) should be tried;

    • struct page *node_mem_map;

    ——points to an array of struct page in which each entry describes one physical page frame of this node, so the array covers all of the node's physical pages;

    • struct page_cgroup *node_page_cgroup;

    ——used to manage the node's page_cgroup descriptors; page_cgroup used to be a member of the struct page management structure but has been moved out here, and all of the node's page_cgroup entries are allocated during initialization;

    • struct bootmem_data *bdata;

    ——points to bootmem_node_data (which can be located via System.map); it originally stored the bootmem allocator's bookkeeping information. When the kernel uses the memblock allocator instead (CONFIG_NO_BOOTMEM), this member is compiled out;

    • unsigned long node_start_pfn;

    ——the page frame number of the first physical page managed by this pg_data_t;

    • unsigned long node_present_pages;

    ——the total number of physical pages present in the node, excluding the pages lying in memory holes;

    • unsigned long node_spanned_pages;

    ——the difference between the node's highest and lowest page frame numbers, i.e. the total size of the node's physical page range including holes (the sketch after this member list shows how these size fields can be read);

    • int node_id;

    ——the index (node ID) of this pg_data_t; on non-NUMA systems it is 0;

    • nodemask_t reclaim_nodes;

    ——the mask of nodes that reclaim on behalf of this node is allowed to target (the "Nodes allowed to reclaim from" mask);

    • wait_queue_head_t kswapd_wait;

    ——kswapd is the page-out daemon; it sleeps on this wait queue, and when the wake-up conditions are met wake_up_interruptible() is called on the queue so that kswapd can start reclaiming;

    • wait_queue_head_t pfmemalloc_wait;

    ——used to throttle tasks that are performing direct reclaim;

    • struct task_struct *kswapd;

    ——the task pointer of the node's kswapd daemon thread;

    • int kswapd_max_order;

    ——the maximum allocation order that kswapd should reclaim for; it is set by the allocation path that wakes kswapd;

    • enum zone_type classzone_idx;

    ——used together with kswapd_max_order: it records the zone index that kswapd should balance the node for;
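
    As a small illustration of the node-level fields above, here is a minimal sketch (not from the original post) that walks every online node and prints its size information. It assumes a 3.14-era kernel build environment; for_each_online_node() and NODE_DATA() are the standard helpers for iterating nodes.

    #include <linux/mmzone.h>
    #include <linux/nodemask.h>
    #include <linux/printk.h>

    /* Illustrative sketch: dump each online node's pg_data_t size fields. */
    static void dump_node_sizes(void)
    {
        int nid;

        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);

            pr_info("node %d: start_pfn=%lu present=%lu spanned=%lu zones=%d\n",
                pgdat->node_id,
                pgdat->node_start_pfn,
                pgdat->node_present_pages,
                pgdat->node_spanned_pages,
                pgdat->nr_zones);
        }
    }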

    【file:/include/linux/mmzone.h】
    struct zone {
        /* Fields commonly accessed by the page allocator */
     
        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long watermark[NR_WMARK];
     
        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;
     
        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
         * GB of ram we must reserve some of the lower zone memory (otherwise we risk
         * to run OOM on the lower zones despite there's tons of freeable ram
         * on the higher zones). This array is recalculated at runtime if the
         * sysctl_lowmem_reserve_ratio sysctl changes.
         */
        unsigned long lowmem_reserve[MAX_NR_ZONES];
     
        /*
         * This is a per-zone reserve of pages that should not be
         * considered dirtyable memory.
         */
        unsigned long dirty_balance_reserve;
     
    #ifdef CONFIG_NUMA
        int node;
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
        unsigned long min_unmapped_pages;
        unsigned long min_slab_pages;
    #endif
        struct per_cpu_pageset __percpu *pageset;
        /*
         * free areas of different sizes
         */
        spinlock_t lock;
    #if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool compact_blockskip_flush;
     
        /* pfns where compaction scanners should start */
        unsigned long compact_cached_free_pfn;
        unsigned long compact_cached_migrate_pfn;
    #endif
    #ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t span_seqlock;
    #endif
        struct free_area free_area[MAX_ORDER];
     
    #ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long *pageblock_flags;
    #endif /* CONFIG_SPARSEMEM */
     
    #ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         */
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
    #endif
     
        ZONE_PADDING(_pad1_)
     
        /* Fields commonly accessed by the page reclaim scanner */
        spinlock_t lru_lock;
        struct lruvec lruvec;
     
        unsigned long pages_scanned; /* since last reclaim */
        unsigned long flags; /* zone flags, see below */
     
        /* Zone statistics */
        atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
     
        /*
         * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
         * this zone's LRU. Maintained by the pageout code.
         */
        unsigned int inactive_ratio;
     
     
        ZONE_PADDING(_pad2_)
        /* Rarely used or read-mostly fields */
     
        /*
         * wait_table -- the array holding the hash table
         * wait_table_hash_nr_entries -- the size of the hash table array
         * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
         *
         * The purpose of all these is to keep track of the people
         * waiting for a page to become available and make them
         * runnable again when possible. The trouble is that this
         * consumes a lot of space, especially when so few things
         * wait on pages at a given time. So instead of using
         * per-page waitqueues, we use a waitqueue hash table.
         *
         * The bucket discipline is to sleep on the same queue when
         * colliding and wake all in that wait queue when removing.
         * When something wakes, it must check to be sure its page is
         * truly available, a la thundering herd. The cost of a
         * collision is great, but given the expected load of the
         * table, they should be so rare as to be outweighed by the
         * benefits from the saved space.
         *
         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
         * primary users of these fields, and in mm/page_alloc.c
         * free_area_init_core() performs the initialization of them.
         */
        wait_queue_head_t * wait_table;
        unsigned long wait_table_hash_nr_entries;
        unsigned long wait_table_bits;
     
        /*
         * Discontig memory support fields.
         */
        struct pglist_data *zone_pgdat;
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long zone_start_pfn;
     
        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         * spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         * present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         * managed_pages = present_pages - reserved_pages;
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path. But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock. It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't
         * tolerant drift of present_pages should hold memory hotplug lock to
         * get a stable value.
         *
         * Read access to managed_pages should be safe because it's unsigned
         * long. Write access to zone->managed_pages and totalram_pages are
         * protected by managed_page_count_lock at runtime. Idealy only
         * adjust_managed_page_count() should be used instead of directly
         * touching zone->managed_pages and totalram_pages.
         */
        unsigned long spanned_pages;
        unsigned long present_pages;
        unsigned long managed_pages;
     
        /*
         * Number of MIGRATE_RESEVE page block. To maintain for just
         * optimization. Protected by zone->lock.
         */
        int nr_migrate_reserve_block;
     
        /*
         * rarely used fields:
         */
        const char *name;
    } ____cacheline_internodealigned_in_smp;
    
    • unsigned long watermark[NR_WMARK];

    ——this array holds the three watermarks WMARK_MIN, WMARK_LOW and WMARK_HIGH; as the names suggest, min is the smallest, low sits in the middle and high is the largest. During allocation, when the number of free pages drops below the low watermark the allocator wakes the kswapd daemon to reclaim pages in the background; when it drops below the min watermark the allocating task itself falls back to direct (synchronous) reclaim; once kswapd has brought the free pages back above the high watermark it goes back to sleep (a small sketch after this member list illustrates these checks);

    • unsigned long percpu_drift_mark;

    ——when the number of free pages falls below this value, extra steps are taken when reading the free-page count so that per-CPU counter drift cannot cause the watermarks above to be silently breached;

    • unsigned long lowmem_reserve[MAX_NR_ZONES];

    ——records, for each zone, the number of physical pages this zone must keep in reserve so that the lower zones are not exhausted by allocations that could also have been satisfied from higher zones; effectively an emergency reserve, recalculated whenever sysctl_lowmem_reserve_ratio changes;

    • unsigned long dirty_balance_reserve;

    ——an approximation of the number of free pages that the allocator will not hand out and that therefore should not be counted as dirtyable memory;

    • struct per_cpu_pageset __percpu *pageset;

    ——the per-CPU page sets of this zone; the pcp member inside them implements the hot/cold page lists;

    • spinlock_t lock;

    ——a spinlock protecting the zone's free areas and related allocator state against concurrent access;

    • struct free_area free_area[MAX_ORDER];

    ——the per-order free lists used by the buddy allocator;

    • unsigned long *pageblock_flags;

    ——stores per-pageblock flags such as the migrate type; it is used by the buddy allocator's anti-fragmentation (page migration) logic;

    • spinlock_t lru_lock;

    ——a spinlock protecting the lruvec data below;

    • struct lruvec lruvec;

    ——the lruvec contains a lists array holding the LRU lists used for page aging, and a reclaim_stat member recording page-reclaim statistics;

    • unsigned long pages_scanned;

    ——the number of page descriptors scanned since pages were last reclaimed from this zone;

    • unsigned long flags;

    ——flags describing the current state of the zone;

    • atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

    ——the per-zone statistics counters;

    • unsigned int inactive_ratio;

    ——the target ratio of active to inactive anonymous pages on this zone's LRU, maintained by the pageout code;

    • wait_queue_head_t *wait_table;

    • unsigned long wait_table_hash_nr_entries;

    • unsigned long wait_table_bits;

    ——together, these three fields implement the hash table of wait queues on which tasks sleep while waiting for a page to become available; see the long comment in the structure definition above;

    • struct pglist_data *zone_pgdat;

    ——points back to the pglist_data (node) that this zone belongs to;

    • unsigned long zone_start_pfn;

    ——the lowest physical page frame number belonging to this zone;

    • unsigned long spanned_pages;

    ——the total number of pages spanned by the zone, including holes; in effect the difference between the zone's end and start page frame numbers;

    • unsigned long present_pages;

    ——the number of physical pages actually present in the zone, i.e. spanned_pages minus the pages lying in holes;

    • unsigned long managed_pages;

    ——the number of present pages managed by the buddy allocator, i.e. present_pages minus the pages reserved or handed out during early boot;

    • int nr_migrate_reserve_block;

    ——the number of MIGRATE_RESERVE pageblocks; it is kept purely as an optimization and is protected by zone->lock;

    • const char *name;

    ——the name of the zone, e.g. "DMA", "Normal" or "HighMem";
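
    To make the watermark discussion above concrete, here is a minimal sketch (not from the original post) of the kind of test the allocator and kswapd perform against a zone's watermarks. It assumes a 3.14-era kernel tree; zone_page_state(), the *_wmark_pages() macros and for_each_populated_zone() are the existing helpers, while the policy shown is a simplification of zone_watermark_ok().

    #include <linux/mmzone.h>
    #include <linux/vmstat.h>
    #include <linux/printk.h>

    /* Illustrative sketch: report how a zone's free pages compare to its watermarks. */
    static void report_zone_pressure(struct zone *zone)
    {
        unsigned long free = zone_page_state(zone, NR_FREE_PAGES);

        if (free <= min_wmark_pages(zone))
            pr_info("%s: below min watermark, direct reclaim is likely\n", zone->name);
        else if (free <= low_wmark_pages(zone))
            pr_info("%s: below low watermark, kswapd should be woken\n", zone->name);
        else if (free >= high_wmark_pages(zone))
            pr_info("%s: above high watermark, kswapd may go back to sleep\n", zone->name);
    }

    static void report_all_zones(void)
    {
        struct zone *zone;

        /* for_each_populated_zone() visits every zone that has present pages. */
        for_each_populated_zone(zone)
            report_zone_pressure(zone);
    }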

    【file:/include/linux/mm_types.h】
    /*
     * Each physical page in the system has a struct page associated with
     * it to keep track of whatever it is we are using the page for at the
     * moment. Note that we have no way to track which tasks are using
     * a page, though if it is a pagecache page, rmap structures can tell us
     * who is mapping it.
     *
     * The objects in struct page are organized in double word blocks in
     * order to allows us to use atomic double word operations on portions
     * of struct page. That is currently only used by slub but the arrangement
     * allows the use of atomic double word operations on the flags/mapping
     * and lru list pointers also.
     */
    struct page {
        /* First double word block */
        unsigned long flags; /* Atomic flags, some possibly
                         * updated asynchronously */
        union {
            struct address_space *mapping; /* If low bit clear, points to
                             * inode address_space, or NULL.
                             * If page mapped as anonymous
                             * memory, low bit is set, and
                             * it points to anon_vma object:
                             * see PAGE_MAPPING_ANON below.
                             */
            void *s_mem; /* slab first object */
        };
     
        /* Second double word */
        struct {
            union {
                pgoff_t index; /* Our offset within mapping. */
                void *freelist; /* sl[aou]b first free object */
                bool pfmemalloc; /* If set by the page allocator,
                             * ALLOC_NO_WATERMARKS was set
                             * and the low watermark was not
                             * met implying that the system
                             * is under some pressure. The
                             * caller should try ensure
                             * this page is only used to
                             * free other pages.
                             */
            };
     
            union {
    #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
        defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
                /* Used for cmpxchg_double in slub */
                unsigned long counters;
    #else
                /*
                 * Keep _count separate from slub cmpxchg_double data.
                 * As the rest of the double word is protected by
                 * slab_lock but _count is not.
                 */
                unsigned counters;
    #endif
     
                struct {
     
                    union {
                        /*
                         * Count of ptes mapped in
                         * mms, to show when page is
                         * mapped & limit reverse map
                         * searches.
                         *
                         * Used also for tail pages
                         * refcounting instead of
                         * _count. Tail pages cannot
                         * be mapped and keeping the
                         * tail page _count zero at
                         * all times guarantees
                         * get_page_unless_zero() will
                         * never succeed on tail
                         * pages.
                         */
                        atomic_t _mapcount;
     
                        struct { /* SLUB */
                            unsigned inuse:16;
                            unsigned objects:15;
                            unsigned frozen:1;
                        };
                        int units; /* SLOB */
                    };
                    atomic_t _count; /* Usage count, see below. */
                };
                unsigned int active; /* SLAB */
            };
        };
     
        /* Third double word block */
        union {
            struct list_head lru; /* Pageout list, eg. active_list
                         * protected by zone->lru_lock !
                         */
            struct { /* slub per cpu partial pages */
                struct page *next; /* Next partial slab */
    #ifdef CONFIG_64BIT
                int pages; /* Nr of partial slabs left */
                int pobjects; /* Approximate # of objects */
    #else
                short int pages;
                short int pobjects;
    #endif
            };
     
            struct list_head list; /* slobs list of pages */
            struct slab *slab_page; /* slab fields */
            struct rcu_head rcu_head; /* Used by SLAB
                             * when destroying via RCU
                             */
    #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
            pgtable_t pmd_huge_pte; /* protected by page->ptl */
    #endif
        };
     
        /* Remainder is not double word aligned */
        union {
            unsigned long private; /* Mapping-private opaque data:
                             * usually used for buffer_heads
                             * if PagePrivate set; used for
                             * swp_entry_t if PageSwapCache;
                             * indicates order in the buddy
                             * system if PG_buddy is set.
                             */
    #if USE_SPLIT_PTE_PTLOCKS
    #if ALLOC_SPLIT_PTLOCKS
            spinlock_t *ptl;
    #else
            spinlock_t ptl;
    #endif
    #endif
            struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
            struct page *first_page; /* Compound tail pages */
        };
     
        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
    #if defined(WANT_PAGE_VIRTUAL)
        void *virtual; /* Kernel virtual address (NULL if
                           not kmapped, ie. highmem) */
    #endif /* WANT_PAGE_VIRTUAL */
    #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
        unsigned long debug_flags; /* Use atomic bitops on this */
    #endif
     
    #ifdef CONFIG_KMEMCHECK
        /*
         * kmemcheck wants to track the status of each byte in a page; this
         * is a pointer to such a status block. NULL if not tracked.
         */
        void *shadow;
    #endif
     
    #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
    #endif
    };
    

    (This structure contains many unions, which let different subsystems and algorithms reuse the same space for their own data; only some of the more common members are described here for now.)

    • unsigned long flags;

    ——the atomic flag bits recording the state and type of the page frame;

    • struct address_space *mapping;

    ——distinguishes file-mapped (page cache) pages from anonymous pages: when the low bit is clear it points to the inode's address_space, and when the low bit is set the page is anonymous and it points to an anon_vma object;

    • atomic_t _mapcount;

    ——counts how many page table entries across all address spaces map this page (see the sketch after this member list);

    • atomic_t _count;

    ——the page's reference count, i.e. how many references the system currently holds on this page;

    • struct list_head lru;

    ——while the page frame is allocated this field links it into one of the LRU lists in the zone's lruvec; while the page frame is free it is used by the buddy allocator;

    • unsigned long private;

    ——a pointer to "private" data whose meaning depends on how the page is used: typically buffer_heads when PagePrivate is set, a swp_entry_t for swap-cache pages, or the order of the block when the page sits in the buddy allocator;

    • void *virtual;

    ——for pages in high memory, i.e. pages that cannot be permanently mapped into the kernel's address space, this field stores the page's kernel virtual address (NULL if the page is not kmapped);
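
    As a small illustration of the struct page members above, here is a minimal sketch (not from the original post) showing how they are usually read through the standard wrappers. It assumes a 3.14-era kernel tree; page_count(), page_mapcount() and PageAnon() are the existing helpers that wrap _count, _mapcount and the PAGE_MAPPING_ANON bit of mapping.

    #include <linux/mm.h>
    #include <linux/printk.h>

    /* Illustrative sketch: describe a page frame using the fields discussed above. */
    static void describe_page(struct page *page)
    {
        pr_info("flags=%#lx refs=%d mapcount=%d %s\n",
            page->flags,              /* atomic page flags */
            page_count(page),         /* reads page->_count */
            page_mapcount(page),      /* reads page->_mapcount (+1) */
            PageAnon(page) ? "anonymous" : "file-backed or unmapped");
    }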

  • Original article: https://www.cnblogs.com/linhaostudy/p/11628887.html