start_kernel ——> setup_arch ——> arch_mem_init ——> |——> bootmem_init
|——> device_tree_init
|——> sparse_init
|——> plat_swiotlb_setup
|——> paging_init
我们看看paging_init做了什么?!
/*
 * paging_init - set up the page tables, then hand the per-zone PFN limits
 * to the generic node/zone initialisation.
 *
 * Fills max_zone_pfns[] with the highest PFN of every configured zone and
 * passes the array to free_area_init_nodes().
 */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	unsigned long lastpfn __maybe_unused;

	/* Page-table initialisation; not covered by this walkthrough. */
	pagetable_init();

#ifdef CONFIG_HIGHMEM
	kmap_init();
#endif
	kmap_coherent_init();

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
	lastpfn = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
	lastpfn = highend_pfn;
#endif

	/*
	 * 1. max_zone_pfns is an array; MAX_NR_ZONES = 3 on the author's build.
	 *      max_zone_pfns[0] = 131072     : 0 -> ZONE_NORMAL
	 *      max_zone_pfns[1] = 262144     : 1 -> ZONE_HIGHMEM
	 *      max_zone_pfns[2] = 2155610112 : 2 -> ZONE_MOVABLE
	 *    The array holds the highest PFN of each zone of the UMA node.
	 *    The ZONE_MOVABLE entry is a garbage value: it is never written
	 *    here, because ZONE_MOVABLE is a virtual zone whose PFN has not
	 *    been computed yet at this point.
	 *    Note on max_zone_pfns[0]: the real low memory of the author's
	 *    system is PFN 0 - 57344; 131072 corresponds to the 512M mark,
	 *    i.e. MIPS treats everything below 512M as low memory by default.
	 * 2. free_area_init_nodes() does the rest.
	 */
	free_area_init_nodes(max_zone_pfns);
}
初始化内存域和节点数据结构
回忆上篇文章,我们设置了一个数组:early_node_map,到此我们通过基于体系结构相关代码获取了如下信息:
① 系统中各个内存域的页帧边界,保存在 max_zone_pfns 数组中(传入 free_area_init_nodes 后对应形参 max_zone_pfn)。
② 各结点页帧的分配情况,保存在全局变量early_node_map中。
1. 管理数据结构的创建
从内核2.6.10开始提供了一个通用的框架,用于将上述信息转换为伙伴系统预期的结点和内存域数据结构。在这以前,各个体系结构必须自行建立相关数据结构。现在体系结构只需建立简单结构,将繁重的工作交给free_area_init_nodes完成。
/* mm/page_alloc.c */
/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 *
 * In short: the size of every zone of every node is computed, and the
 * holes inside them are computed as well.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long nid;
	int i;

	/*
	 * Sort early_node_map as initialisation assumes it is sorted.
	 * Author's note: sort_node_map() uses the generic heapsort that the
	 * kernel provides in lib/sort.c.
	 */
	sort_node_map();

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
	/*
	 * Observed on the author's board:
	 *   arch_zone_lowest_possible_pfn[0]  = 0
	 *   arch_zone_highest_possible_pfn[0] = 131072
	 */
	for (i = 1; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		/* Each zone starts where the previous one ended. */
		arch_zone_lowest_possible_pfn[i] =
			arch_zone_highest_possible_pfn[i-1];
		arch_zone_highest_possible_pfn[i] =
			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
	}
	/*
	 * Observed on the author's board:
	 *   arch_zone_lowest_possible_pfn[1]  = 131072
	 *   arch_zone_highest_possible_pfn[1] = 262144
	 */

	/*
	 * ZONE_MOVABLE is a virtual zone that is not tied to any real
	 * hardware zone, so its boundaries are always set to 0. The zone only
	 * exists when one of the kernel command-line parameters kernelcore or
	 * movablecore is given. It normally starts at some PFN inside a
	 * particular zone of each node; those PFNs are computed by
	 * find_zone_movable_pfns_for_nodes().
	 */
	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes(zone_movable_pfn);

	/* Print out the zone ranges */
	printk("Zone PFN ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		printk(" %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			printk("empty\n");
		else
			printk("%0#10lx -> %0#10lx\n",
				arch_zone_lowest_possible_pfn[i],
				arch_zone_highest_possible_pfn[i]);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	printk("Movable zone start PFN for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
	}

	/* Print out the early_node_map[] */
	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
	for (i = 0; i < nr_nodemap_entries; i++)
		printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
						early_node_map[i].start_pfn,
						early_node_map[i].end_pfn);
	/*
	 * Output observed on the author's board (end PFNs are exclusive):
	 *   Zone PFN ranges:
	 *     Normal   0x00000000 -> 0x00020000   [PFN 0 - 131071]
	 *     HighMem  0x00020000 -> 0x00040000   [PFN 131072 - 262143]
	 *   Movable zone start PFN for each node  [ZONE_MOVABLE not enabled]
	 *   early_node_map[2] active PFN ranges
	 *     0: 0x00000000 -> 0x0000e000         [0 -> 57344]
	 *     1: 0x00030000 -> 0x00040000         [196608 -> 262144]
	 */

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();

	/*
	 * Walk every online node and let free_area_init_node() build its
	 * data structures. On a UMA system the loop body runs exactly once.
	 */
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		free_area_init_node(nid, NULL,
				find_min_pfn_for_node(nid), NULL);

		/*
		 * Any memory on that node: if the node has present pages, flag
		 * it as N_HIGH_MEMORY in the node state bitmap.
		 */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_HIGH_MEMORY);
		/*
		 * Author's note: additionally checks whether a zone below
		 * ZONE_HIGHMEM holds memory and, if so, sets N_NORMAL_MEMORY
		 * for the node.
		 */
		check_for_regular_memory(pgdat);
	}
}
/*
 * free_area_init_node - set up the pg_data_t of a single node.
 * @nid:            id of the node to initialise
 * @zones_size:     per-zone sizes, or NULL (free_area_init_nodes() passes NULL)
 * @node_start_pfn: first PFN of the node
 * @zholes_size:    per-zone hole sizes, or NULL
 *
 * Records the node id and start PFN, computes the node's page totals,
 * allocates its mem_map and finally calls free_area_init_core().
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	pgdat->node_id = nid;			/* 0 on the author's board */
	pgdat->node_start_pfn = node_start_pfn;	/* 0 on the author's board */
	calculate_node_totalpages(pgdat, zones_size, zholes_size);

	alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)pgdat,
		(unsigned long)pgdat->node_mem_map);
	/*
	 * Observed on the author's board:
	 *   free_area_init_node: node 0, pgdat 807874e0, node_mem_map 81000000
	 */
#endif

	/*
	 * Debug trace of the optional size arrays. Both pointers may be NULL,
	 * so they are only dereferenced when present; 0 is printed otherwise.
	 */
	printk("%d : *zones_size = %lu, *zholes_size = %lu\n", nid,
		zones_size ? *zones_size : 0UL,
		zholes_size ? *zholes_size : 0UL);

	free_area_init_core(pgdat, zones_size, zholes_size);
}
/*
 * calculate_node_totalpages - fill in the page counts of a node.
 * @pgdat:       node descriptor to update
 * @zones_size:  per-zone sizes, forwarded to zone_spanned_pages_in_node()
 * @zholes_size: per-zone hole sizes, forwarded to zone_absent_pages_in_node()
 *
 * Sets pgdat->node_spanned_pages to the sum of the spanned pages of all
 * zones, and pgdat->node_present_pages to that sum minus the absent pages
 * of all zones.
 */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long realtotalpages, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
								zones_size);
	pgdat->node_spanned_pages = totalpages;

	realtotalpages = totalpages;
	for (i = 0; i < MAX_NR_ZONES; i++)
		realtotalpages -=
			zone_absent_pages_in_node(pgdat->node_id, i,
								zholes_size);
	pgdat->node_present_pages = realtotalpages;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							realtotalpages);
	/*
	 * Values observed on the author's board:
	 *   pgdat->node_spanned_pages = 262144   (holes included)
	 *   pgdat->node_present_pages = 122880   (holes excluded)
	 */
}
/*
 * alloc_node_mem_map - allocate the struct page array of a node.
 * @pgdat: node descriptor whose node_mem_map is to be set up
 *
 * Does nothing for a node that spans no pages. With
 * CONFIG_FLAT_NODE_MEM_MAP, a node without a node_mem_map gets one sized
 * for its MAX_ORDER-aligned PFN span; without CONFIG_NEED_MULTIPLE_NODES
 * the global mem_map is pointed at node 0's map.
 */
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* Skip empty nodes: a node without any zone pages needs no map. */
	if (pgdat->node_spanned_pages == 0)
		return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
	/*
	 * ia64 gets its own node_mem_map, before this, without bootmem.
	 * Author's note: every physical page frame has one struct page, and
	 * node_mem_map holds the address of the first of them.
	 */
	if (pgdat->node_mem_map == NULL) {
		unsigned long first_pfn, end_pfn, map_bytes;
		struct page *page_map;

		/*
		 * The zone's endpoints aren't required to be MAX_ORDER
		 * aligned but the node_mem_map endpoints must be in order
		 * for the buddy allocator to function correctly.
		 */
		first_pfn = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
		end_pfn = ALIGN(pgdat->node_start_pfn +
				pgdat->node_spanned_pages, MAX_ORDER_NR_PAGES);
		map_bytes = (end_pfn - first_pfn) * sizeof(struct page);
		/*
		 * Observed on the author's board:
		 *   start = 0, end = 262144, size = 8388608
		 * i.e. struct page instances are created for the holes too.
		 */

		/*
		 * Author's note: alloc_remap() returns NULL when the
		 * architecture code has not set up a memory map; the bootmem
		 * allocator is used as the fallback.
		 */
		page_map = alloc_remap(pgdat->node_id, map_bytes);
		if (page_map == NULL)
			page_map = alloc_bootmem_node_nopanic(pgdat, map_bytes);
		pgdat->node_mem_map =
			page_map + (pgdat->node_start_pfn - first_pfn);
	}
#ifndef CONFIG_NEED_MULTIPLE_NODES
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0)) {
		mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
	}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
函数 free_area_init_core 对 pgdat 中相关的数据结构进行初始化设置。