节点
NUMA(Non-uniform memory access),非一致内存访问。在这种模式下,内存不是一整块。每个 CPU 都有自己的本地内存,CPU 访问本地内存不用过总线,因而速度要快很多,每个 CPU 和内存在一起,称为一个 NUMA 节点。但是,在本地内存不足的情况下,每个 CPU 都可以去另外的 NUMA 节点申请内存,这个时候访问延时就会比较长。
我们主要解析当前的主流场景,NUMA 方式。我们首先要能够表示 NUMA 节点的概念,于是有了typedef struct pglist_data pg_data_t。
每个节点分成一个个区域 zone,放在数组 node_zones 里面。这个数组的大小为 MAX_NR_ZONES。这里对于区域的划分,都是针对物理内存的。
DMA
要把外设的数据读入内存或把内存的数据传送到外设,原来都要通过 CPU 控制完成,但是这会占用 CPU,影响 CPU 处理其他事情,所以有了 DMA 模式。CPU 只需向 DMA 控制器下达指令,让 DMA 控制器来处理数据的传送,数据传送完毕再把信息反馈给 CPU,这样就可以解放 CPU。
typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; // 备用节点和它的内存区域的情况 int nr_zones; // 当前节点区域的数量 struct page *node_mem_map; //这个节点的 struct page 数组,用于描述这个节点里面的所有的页 unsigned long node_start_pfn; // 这个节点的起始页号 unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; //每一个节点都有自己的 ID ...... } pg_data_t; // 整个内存被分成了多个节点,pglist_data放在一个数组里面,每个节点一项 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; enum zone_type { #ifdef CONFIG_ZONE_DMA ZONE_DMA, // 指可用于作 DMA(Direct Memory Access,直接内存存取)的内存 #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, // 对于 64 位系统,有两个 DMA 区域:ZONE_DMA,ZONE_DMA32 #endif ZONE_NORMAL, // 直接映射区,从物理内存到虚拟内存的内核区域,通过加上一个常量直接映射 #ifdef CONFIG_HIGHMEM ZONE_HIGHMEM, // 高端内存区,对于 32 位系统来说超过 896M 的地方,对于 64 位没必要有的一段区域 #endif ZONE_MOVABLE, // 可移动区域,通过将物理内存划分为可移动分配区域和不可移动分配区域来避免内存碎片 __MAX_NR_ZONES };
例如,64M 物理内存隔着一个 4M 的空洞,然后是另外的 64M 物理内存。这样换算成页面数目就是,16K 个页面隔着 1K 个页面,然后是另外 16K 个页面。这种情况下,node_spanned_pages 就是 33K 个页面,node_present_pages 就是 32K 个页面。
// mm/page_alloc.c static char * const zone_names[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA "DMA", #endif #ifdef CONFIG_ZONE_DMA32 "DMA32", #endif "Normal", #ifdef CONFIG_HIGHMEM "HighMem", #endif "Movable", #ifdef CONFIG_ZONE_DEVICE "Device", #endif }; /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif #ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; // include/linux/mmzone.h struct zone { ...... /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; ...... } struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to * MAX_ORDER_NR_PAGES should biggest page be bigger than * a single pageblock. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES };
zone_type
// include/linux/mmzone.h enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES };
区域
到这里,我们把内存分成了节点,把节点分成了区域。表示区域的数据结构 struct zone 的定义如下:
struct zone { ...... struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; unsigned long zone_start_pfn; // 该zone的起始页 /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; */ unsigned long managed_pages; unsigned long spanned_pages; unsigned long present_pages; const char *name; ...... /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; ...... } ____cacheline_internodealigned_in_
per_cpu_pageset 用于区分冷热页。
什么叫冷热页呢?如果一个页被加载到 CPU 高速缓存里面,这就是一个热页(Hot Page),CPU 读起来速度会快很多,如果没有就是冷页(Cold Page)。
由于每个 CPU 都有自己的高速缓存,因而 per_cpu_pageset 也是每个 CPU 一个。
页
了解了区域 zone,接下来我们就到了组成物理内存的基本单位,页的数据结构 struct page。
这是一个特别复杂的结构,里面有很多的 union,union 结构是在 C 语言中被用于同一块内存根据情况保存不同类型数据的一种方式。这里之所以用了 union,是因为一个物理页面使用模式有多种。
第一种模式,要用就用一整页。
这一整页的内存,或者直接和虚拟地址空间建立映射关系,我们把这种称为匿名页(Anonymous Page)。
或者用于关联一个文件,然后再和虚拟地址空间建立映射关系,这样的文件,我们称为内存映射文件(Memory-mapped File)。
第二种模式,仅需分配小块内存。
有时候,我们不需要一下子分配这么多的内存,例如分配一个 task_struct 结构,只需要分配小块的内存,去存储这个进程描述结构的对象。为了满足对这种小内存块的需要,Linux 系统采用了一种被称为 slab allocator 的技术,用于分配称为 slab 的一小块内存。它的基本原理是从内存管理模块申请一整块页,然后划分成多个小块的存储池,用复杂的队列来维护这些小块的状态(状态包括:被分配了 / 被放回池子 / 应该被回收)。也正是因为 slab allocator 对于队列的维护过于复杂,后来就有了一种不使用队列的分配器 slub allocator,后面我们会解析这个分配器。但是你会发现,它里面还是用了很多 slab 的字眼,因为它保留了 slab 的用户接口,可以看成 slab allocator 的另一种实现。还有一种小块内存的分配器称为 slob,非常简单,主要使用在小型的嵌入式系统。
// include/linux/mm_types.h struct page { unsigned long flags; /* Atomic flags, some possibly updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. * WARNING: bit 0 of the first word is used for PageTail(). That * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ struct list_head lru; // 表示这一页应该在一个链表上,例如这个页面被换出,就在换出页的链表中 /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; // 用于内存映射,如果是匿名页,最低位为 1;如果是映射文件,最低位为 0 pgoff_t index; /* Our offset within mapping. 在映射区的偏移量 */ /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if PageSwapCache. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; }; struct { /* page_pool used by netstack */ /** * @dma_addr: might require a 64-bit value on * 32-bit architectures. */ unsigned long dma_addr[2]; }; struct { /* slab, slob and slub */ union { struct list_head slab_list; struct { /* Partial pages */ struct page *next; #ifdef CONFIG_64BIT int pages; /* Nr of pages left */ int pobjects; /* Approximate count */ #else short int pages; short int pobjects; #endif }; }; struct kmem_cache *slab_cache; /* not slob */ /* Double-word boundary */ void *freelist; /* first free object */ union { void *s_mem; /* slab: first object 已经分配了正在使用的 slab 的第一个对象 */ unsigned long counters; /* SLUB */ struct { /* SLUB */ unsigned inuse:16; unsigned objects:15; unsigned frozen:1; }; }; }; // compound 相关的变量用于复合页(Compound Page),就是将物理上连续的两个或多个页看成一个独立的大页。 struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ /* First tail page only */ unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; unsigned int compound_nr; /* 1 << compound_order */ }; struct { /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ atomic_t hpage_pinned_refcount; /* For both global and memcg */ struct list_head deferred_list; }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ union { struct mm_struct *pt_mm; /* x86 pgds only */ atomic_t pt_frag_refcount; /* powerpc */ }; #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; #endif }; struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being * mapped so the next 3 words hold the mapping, index, * and private fields from the source anonymous or * page cache page while the page is migrated to device * private memory. * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also * use the mapping, index, and private fields when * pmem backed DAX files are mapped. */ }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; // 需要释放的列表 }; union { /* This union is 4 bytes in size. */ /* * If the page can be mapped to userspace, encodes the number * of times this page is referenced by a page table. */ atomic_t _mapcount; // 每个进程都有自己的页表,这里指有多少个页表项指向了这个页 /* * If the page is neither PageSlab nor mappable to userspace, * the value stored here may help determine what this page * is used for. See page-flags.h for a list of page types * which are currently stored here. */ unsigned int page_type; unsigned int active; /* SLAB */ int units; /* SLOB */ }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #endif /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with * highmem some memory is mapped into kernel virtual memory * dynamically, so we need a place to store that address. * Note that this field could be 16 bits on x86 ... ;) * * Architectures with slow multiplication can define * WANT_PAGE_VIRTUAL in asm/page.h */ #if defined(WANT_PAGE_VIRTUAL) void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif } _struct_page_alignment;
页的分配
// 在 struct zone 里面有以下的定义: struct free_area free_area[MAX_ORDER]; #define MAX_ORDER 11 static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } /** * alloc_pages_current - Allocate pages. * * @gfp: 表示希望在哪个区域中分配这个内存 * %GFP_USER user allocation, 用于分配一个页映射到用户进程的虚拟地址空间,主要用于一个用户进程希望通过内存映射的方式,访问某些硬件的缓存,例如显卡缓存 * %GFP_KERNEL kernel allocation, 用于内核中分配页,主要分配 ZONE_NORMAL 区域,也即直接映射区 * %GFP_HIGHMEM highmem allocation, 主要分配高端区域的内存 * %GFP_FS don't call back into a file system. * %GFP_ATOMIC don't sleep. * @order: Power of two of allocation size in pages. 0 is a single page. * * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; ...... // 伙伴系统的核心方法 page = __alloc_pages_nodemask(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); ...... return page; } static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { ...... // 在一个循环中先看当前节点的 zone,如果找不到空闲页,则再看备用节点的 zone for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; ...... // 找到合适大小的那个队列,把页面取下来 page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); ...... } // rmqueue->__rmqueue->__rmqueue_smallest static inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; struct free_area *area; struct page *page; // 从当前的 order,在伙伴系统的 free_area 找 2^order 大小的页块。
// 如果链表的第一个不为空,就找到了;如果为空,就到更大的 order 的页块链表里面去找。 /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); page = list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); if (!page) continue; list_del(&page->lru); // 将页块从链表中取下来 rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); // 把多余部分放到其他页块链表里面 set_pcppage_migratetype(page, migratetype); return page; } return NULL; } static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area, int migratetype) { unsigned long size = 1 << high; // area: 伙伴系统那个表里面的前一项,前一项里面的页块大小是当前项的页块大小除以 2,size 右移一位也就是除以 2 while (high > low) { area--; high--; size >>= 1; ...... list_add(&page[size].lru, &area->free_list[migratetype]); // 加到链表上 area->nr_free++; // 计数加 1 set_page_order(&page[size], high); } }