内核对一致和非一致内存访问系统使用相同的数据结构。在UMA系统上,只使用一个NUMA结点来管理整个系统内存。而内存管理的其他部分则相信他们是在处理一个伪NUMA系统。
3.2.1 概述
内存划分为结点。每个结点关联到系统中的一个处理器,在内核中用pg_data_t的实例表示。
各个结点又划分为内存域,是内存的进一步细分。分为:normal、DMA、highmem 最多3个内存域。
/* ./include/linux/mmzone.h */
/* Memory zones of a node; order and conditional members are ABI-relevant. */
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390			<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 * 			<16M.
	 */
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,	/* NOTE: this range is not guaranteed to correspond to
			 * real physical memory; e.g. on a 64-bit AMD system
			 * with 2GB of RAM, all memory falls into ZONE_DMA32
			 * and ZONE_NORMAL is empty. */
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,	/* pseudo zone, used by the mechanism that prevents
			 * physical memory fragmentation */
	__MAX_NR_ZONES	/* end marker, commonly used when iterating over all
			 * zones of the system */
};
各个内存域都关联了一个数组,用来组织属于该内存域的物理页(内核中称之为页帧)。对于每个页帧,都分配一个struct page实例及所需的管理数据。
各个内存结点保存在一个单链表中,供内核遍历。
出于性能的考虑,在为进程分配内存时,内核总是试图在当前运行CPU相关联的NUMA结点上进行。但这并不总是可行的,例如,该结点的内存已经用尽。对此情况,每个结点都提供一个备用链表(借助于struct zonelist)。该列表包含了其他结点和相关的内存域,可用于代替当前结点分配内存。列表项的位置越靠后,就越不适合分配。
3.2.2 数据结构
1. 结点管理
pg_data_t用于表示结点的基本元素,定义如下:
/* ./include/linux/mmzone.h */
/*
 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 * (mostly NUMA machines?) to denote a higher-level memory zone than the
 * zone denotes.
 *
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * it's memory layout.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];		/* array of the zones belonging
							 * to this memory node */
	struct zonelist node_zonelists[MAX_ZONELISTS];	/* fallback list of other nodes and
							 * their zones, used to allocate from
							 * an alternative node when this node
							 * has no free space left */
	int nr_zones;					/* number of different zones on this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;	/* pointer to an array of page instances
					 * describing all physical pages of the node */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	struct page_cgroup *node_page_cgroup;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
	struct bootmem_data *bdata;	/* instance of the boot memory allocator, which
					 * serves allocations during system startup,
					 * before the memory-management subsystem has
					 * been initialized */
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages stay constant. Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * Nests above zone->lock and zone->size_seqlock.
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;	/* logical number of the node's first page frame;
					 * page frames of all nodes are numbered
					 * consecutively, each number globally unique.
					 * Always 0 on UMA, since there is only one node. */
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;			/* global node ID; NUMA nodes of the system
					 * are numbered starting from 0 */
	wait_queue_head_t kswapd_wait;	/* wait queue of the swap daemon, used when
					 * swapping page frames out of the node */
	struct task_struct *kswapd;	/* task_struct of the swap daemon responsible
					 * for this node */
	int kswapd_max_order;		/* used by the swap subsystem to define the
					 * size of the area to be freed */
	enum zone_type classzone_idx;
} pg_data_t;
- 结点的状态管理
如果系统中结点多于一个,内核会维护一个位图,用以提供各个结点的状态信息。状态是用位掩码指定的,可使用下列值:
/* include/nodemask.h */
/*
 * Bitmasks that are kept for all the nodes.
 */
enum node_states {
	N_POSSIBLE,		/* The node could become online at some point */
	N_ONLINE,		/* The node is online */
	N_NORMAL_MEMORY,	/* The node has regular memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* The node has regular or high memory */
#else
	N_HIGH_MEMORY = N_NORMAL_MEMORY,	/* without highmem the two
						 * states coincide */
#endif
	N_CPU,		/* The node has one or more cpus */
	NR_NODE_STATES	/* end marker / number of node states */
};
状态N_POSSIBLE、N_ONLINE和N_CPU用于CPU和内存的热插拔。如果结点有普通或高端内存,则设置N_HIGH_MEMORY;仅当结点有普通内存时,才设置N_NORMAL_MEMORY(在未配置高端内存的系统上,二者等价)。
/*
 * Two helper functions to set or clear a bit for a particular node in the
 * node-state bitmaps.
 */
void node_set_state(int node, enum node_states state);
void node_clear_state(int node, enum node_states state);
/*
 * Macro for_each_node_state(__node, __state): iterates over all nodes that
 * are in the given state.
 * Macro for_each_online_node(node): iterates over all active (online) nodes.
 */
如果内核编译为只支持单个结点(UMA系统),没有结点位图,上述操作该位图的函数则变为空操作。
2. 内存域
struct zone {
	/* Fields commonly accessed by the page allocator */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long watermark[NR_WMARK];

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

	/*
	 * We don't know if the memory that we're going to allocate will be freeable
	 * or/and it will be released eventually, so to avoid totally wasting several
	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
	 * to run OOM on the lower zones despite there's tons of freeable ram
	 * on the higher zones). This array is recalculated at runtime if the
	 * sysctl_lowmem_reserve_ratio sysctl changes.
	 */
	unsigned long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
#endif
	struct per_cpu_pageset __percpu *pageset;	/* per-CPU hot/cold page
							 * frame lists */
	/*
	 * free areas of different sizes
	 */
	spinlock_t lock;
	int all_unreclaimable;		/* All pages pinned */
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t span_seqlock;
#endif
	struct free_area free_area[MAX_ORDER];	/* starting point of the buddy
						 * system: one entry per
						 * allocation order */

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
#endif

	ZONE_PADDING(_pad1_)	/* padding so the lock below starts on its own
				 * cache line */

	/* Fields commonly accessed by the page reclaim scanner */
	spinlock_t lru_lock;
	struct zone_lru {
		struct list_head list;
	} lru[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;

	unsigned long pages_scanned;	/* since last reclaim */
	unsigned long flags;		/* zone flags, see below */

	/* Zone statistics */
	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

	/*
	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
	 * this zone's LRU. Maintained by the pageout code.
	 */
	unsigned int inactive_ratio;

	ZONE_PADDING(_pad2_)
	/* Rarely used or read-mostly fields */

	/*
	 * wait_table -- the array holding the hash table
	 * wait_table_hash_nr_entries -- the size of the hash table array
	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
	 *
	 * The purpose of all these is to keep track of the people
	 * waiting for a page to become available and make them
	 * runnable again when possible. The trouble is that this
	 * consumes a lot of space, especially when so few things
	 * wait on pages at a given time. So instead of using
	 * per-page waitqueues, we use a waitqueue hash table.
	 *
	 * The bucket discipline is to sleep on the same queue when
	 * colliding and wake all in that wait queue when removing.
	 * When something wakes, it must check to be sure its page is
	 * truly available, a la thundering herd. The cost of a
	 * collision is great, but given the expected load of the
	 * table, they should be so rare as to be outweighed by the
	 * benefits from the saved space.
	 *
	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
	 * primary users of these fields, and in mm/page_alloc.c
	 * free_area_init_core() performs the initialization of them.
	 */
	wait_queue_head_t * wait_table;
	unsigned long wait_table_hash_nr_entries;
	unsigned long wait_table_bits;

	/*
	 * Discontig memory support fields.
	 */
	struct pglist_data *zone_pgdat;	/* back pointer to the owning node */
	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long zone_start_pfn;

	/*
	 * zone_start_pfn, spanned_pages and present_pages are all
	 * protected by span_seqlock. It is a seqlock because it has
	 * to be read outside of zone->lock, and it is done in the main
	 * allocator path. But, it is written quite infrequently.
	 *
	 * The lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock. It's good to
	 * give them a chance of being in the same cacheline.
	 */
	unsigned long spanned_pages;	/* total size, including holes */
	unsigned long present_pages;	/* amount of memory (excluding holes) */

	/*
	 * rarely used fields:
	 */
	const char *name;
} ____cacheline_internodealigned_in_smp;
- 由ZONE_PADDING分割为几个部分。因为对zone结构的访问非常频繁,在SMP上,通常会有不同的CPU试图同时访问该结构成员。因此使用锁防止它们彼此干扰。在内核访问时,会获取两个自旋锁zone->lock和zone->lru_lock。如果数据保存在CPU高速缓存中,那么会处理得更加快速。高速缓存分为行,每一行负责不同的内存区(cache line)。内核使用ZONE_PADDING宏生成“填充”字段添加到结构中,以确保每个自旋锁都处于自身的缓存行中。而且使用了编译器关键字____cacheline_internodealigned_in_smp,用以实现最优的高速缓存对齐方式。
- unsigned long watermark[NR_WMARK];,其中NR_WMARK对应WMARK_MIN、WMARK_LOW、WMARK_HIGH三项。这是页换出时使用的水印。如果内存不足时,内核可以将页写到硬盘。这3个成员会影响交换守护进程的行为。
- 如果空闲页多于WMARK_HIGH,内存域处于理想状态。
- 如果空闲页低于WMARK_LOW,内核开始将页换出到硬盘。
- 如果空闲页低于WMARK_MIN,那么页回收工作压力就比较大,因为内存域中急需空闲页
- lowmem_reserve数组分别指定了各个内存域无论如何都不能失败的关键性内存分配。各个内存域的份额根据重要性决定。
- pageset是一个数组,用于实现每个CPU的冷/热页帧列表。内核使用这些列表来保存可用于满足实现的“新鲜页”。但冷热页帧的高速缓存状态不同:有些页帧也很可能仍然在高速缓存中,因此可以快速访问,故称之为热的;未缓存的页帧与此相对,故为冷的。
- free_area用于实现buddy系统。每个元素表示某种固定长度的一些连续内存区。对于包含在每个内存区中的空闲页的管理,free_area是一个起点。
- pages_scanned指定了上次换出一页以来,有多少页未能成功扫描。
- flags内存域当前状态。
/* Current state flags of a zone, kept in zone->flags. */
typedef enum {
	ZONE_RECLAIM_LOCKED,	/* prevents concurrent reclaim */
	ZONE_OOM_LOCKED,	/* zone is in OOM killer zonelist */
	ZONE_CONGESTED,		/* zone has many dirty pages backed by a congested BDI */
} zone_flags_t;
- vm_stat 维护了大量有关该内存域的统计信息。
- wait_table、wait_table_bits和wait_table_hash_nr_entries实现了一个等待队列,供等待某一页变为可用的进程使用。进程排成一个队列,等待某些条件;在条件变为真时,内核会通知进程恢复工作。
- zone_start_pfn内存域第一个页帧的索引。
3. 内存域水印的计算
在计算各种水印之前,内核首先确定需要为关键性分配保留的内存空间的最小值。该值随可用内存的大小而非线性增长,并保存在全局变量min_free_kbytes中。如下图:大图横轴采用对数坐标,小图横轴采用普通坐标,小图放大了总内存容量在0~4GB之间的变化曲线。总之:不能少于128KB,不能多于64MB【或许现在已经超了】。用户可以通过文件/proc/sys/vm/min_free_kbytes来读取和修改该设置。
/* ./mm/page_alloc.c */
/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min). For large machines
 * we want it large (64MB max). But it is not linear, because network
 * bandwidth does not increase linearly with machine size. We use
 *
 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
int __meminit init_per_zone_wmark_min(void)
{
	unsigned long lowmem_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
	if (min_free_kbytes < 128)	/* lower bound: 128 KiB */
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)	/* upper bound: 64 MiB */
		min_free_kbytes = 65536;
	setup_per_zone_wmarks();	/* recompute WMARK_MIN/LOW/HIGH for every zone */
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();
	setup_per_zone_inactive_ratio();
	return 0;
}
module_init(init_per_zone_wmark_min)
/**
 * setup_per_zone_wmarks - called when min_free_kbytes changes
 * or when memory is hot-{added|removed}
 *
 * Ensures that the watermark[min,low,high] values for each zone are set
 * correctly with respect to min_free_kbytes.
 */
void setup_per_zone_wmarks(void)
{
	/* convert min_free_kbytes (KiB) into a number of pages */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone->present_pages;
	}

	for_each_zone(zone) {
		u64 tmp;

		spin_lock_irqsave(&zone->lock, flags);
		/* this zone's share of pages_min, proportional to its size */
		tmp = (u64)pages_min * zone->present_pages;
		do_div(tmp, lowmem_pages);
		if (is_highmem(zone)) {
			/*
			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
			 * need highmem pages, so cap pages_min to a small
			 * value here.
			 *
			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
			 * deltas controls asynch page reclaim, and so should
			 * not be capped for highmem.
			 */
			int min_pages;

			min_pages = zone->present_pages / 1024;
			if (min_pages < SWAP_CLUSTER_MAX)
				min_pages = SWAP_CLUSTER_MAX;
			if (min_pages > 128)
				min_pages = 128;
			zone->watermark[WMARK_MIN] = min_pages;
		} else {
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportionate to the zone's size.
			 */
			zone->watermark[WMARK_MIN] = tmp;
		}

		/* LOW = MIN + 25% of the share, HIGH = MIN + 50% of the share */
		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
		setup_zone_migrate_reserve(zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}
5. 冷热页
热页:该页已经加载到CPU高速缓存中。冷页反之。
/* Per-CPU page cache of a zone (hot/cold page frame lists). */
struct per_cpu_pageset {
	struct per_cpu_pages pcp;	/* the per-CPU lists of page frames */
#ifdef CONFIG_NUMA
	s8 expire;
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;		/* NOTE(review): presumably the delta
					 * threshold for folding the per-CPU
					 * counters below into zone vm_stat —
					 * confirm against mm/vmstat.c */
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
5. 页帧
页帧代表系统内存的最小单位,对内存中每个页都会创建struct page的一个实例。所以需要该结构尽量小,不然假如一个页4KB,但是描述它的结构却占了2KB,那岂不是很浪费?!内存管理的许多部分都使用页,用于各种不同的用途。那么问题来了,page为某个功能提供的信息,对其他部分可能完全无用。所以C语言的union适合解决该问题。
例如:一个物理内存页能够通过多个地方的不同页表映射到虚拟地址空间,内核想要跟踪有多少个地方映射了该页。为此,struct page中有一个计数器用于计算映射的数目。如果一页用于slab分配器,那么可以确保只有内核会使用该页,而不会有其他地方使用,因此映射计数信息就是多余的。因此内核可以重新解释该字段,用来表示该页被细分为多少个小的内存对象使用。
/* Excerpt: the union that overlays the map count with SLUB bookkeeping. */
struct page {
......
	union {
		atomic_t _mapcount;	/* Count of ptes mapped in mms,
					 * to show when page is mapped
					 * & limit reverse map searches.
					 */
		struct {		/* SLUB */
			u16 inuse;	/* SLUB allocator: number of objects */
			u16 objects;
		};
	};
......
};
/* include/linux/mm_types.h */
/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 */
struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	atomic_t _count;		/* Usage count, see below. */
	union {
		atomic_t _mapcount;	/* Count of ptes mapped in mms,
					 * to show when page is mapped
					 * & limit reverse map searches.
					 */
		struct {		/* SLUB */
			u16 inuse;
			u16 objects;
		};
	};
	union {
	    struct {
		unsigned long private;		/* Mapping-private opaque data:
						 * usually used for buffer_heads
						 * if PagePrivate set; used for
						 * swp_entry_t if PageSwapCache;
						 * indicates order in the buddy
						 * system if PG_buddy is set.
						 */
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below.
						 */
	    };
#if USE_SPLIT_PTLOCKS
	    spinlock_t ptl;
#endif
	    struct kmem_cache *slab;	/* SLUB: Pointer to slab */
	    struct page *first_page;	/* Compound tail pages */
	};
	union {
		pgoff_t index;		/* Our offset within mapping. */
		void *freelist;		/* SLUB: freelist req. slab lock */
	};
	struct list_head lru;		/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 */
	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
	unsigned long debug_flags;	/* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
	/*
	 * kmemcheck wants to track the status of each byte in a page; this
	 * is a pointer to such a status block. NULL if not tracked.
	 */
	void *shadow;
#endif
};
该结构与体系结构无关,不依赖于CPU的类型。
- flags:与体系结构无关的标志。用于描述页的属性。
- _count:引用计数。表示内核中引用该页的次数。在其值为0时,内核就知道page实例当前未被使用,因此可以删除。
- _mapcount:页表中有多少项指向该页。
- lru:一个链表头。用于在各种链表上维护该页,以便将页按不同的类别分类,最重要的类别是活动页和不活动页。
- 内核可以将毗邻的页合并为较大的复合页(compound page)。分组中的第一个页称为首页(head page),而所有其余各页叫做尾页(tail page)。所有尾页对应的page实例中,都将first_page设置为指向首页。
- mapping指定了页帧所在的地址空间,index是页帧在映射内部的偏移量。地址空间是一个非常一般的概念,例如:在向内存读取文件时,地址空间可以将文件的内容与装载数据的内存区关联起来。通过一个小技巧,mapping不仅能够保存一个指针,还能包含一些额外的信息,用于判断页是否属于未关联到地址空间的某个匿名内存区。如果将mapping的最低位设置为1,则该指针并不指向address_space的实例,而是指向另一个数据结构(anon_vma),该结构对实现匿名页的逆向映射很重要。对该指针的双重使用是可能的,因为address_space实例总是对齐到sizeof(long),因此在Linux支持的所有计算机上,指向该实例的指针最低位总是0。
该指针如果指向address_space实例,则可以直接使用。如果使用了技巧将最低位设置为1,内核可使用以下操作恢复来恢复指针:
anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
- private指向”私有数据“。虚拟内存管理会忽略该数据。根据页的用途,可以用不同的方式使用该指针。大多数情况下它用于将页与数据缓冲区关联起来。
- virtual:用于高端内存区中的页。virtual用于存储该页的虚拟地址。
WANT_PAGE_VIRTUAL:只有几个体系结构定义该宏:摩托罗拉m68k、FRV和Xtensa。
而其他体系结构都采用了一种不同的方案来寻址虚拟内存页。其核心是用来查找所有的高端内存页帧的散列表。