关键词:warn_alloc()、__GFP_XXX、order、CMA等等。
在内存申请的时候经常会遇到类似“ xxx: page allocation failure: order:10...”类型的问题,这是warn_alloc()的输出。
warn_alloc()被如下函数调用:__alloc_pages_slowpath()、__vmalloc_area_node()、__vmalloc_node_range()。
下面分三部分了解这种问题的来龙去脉:
- 什么情况会导致warn_alloc()?
- warn_alloc()都做了哪些事情?
- 结合实际问题分析问题原因。
1.触发warn_alloc()情况
要了解什么情况下会导致warn_alloc(),就需要分析它在何种情况下会被调用。
__alloc_pages_slowpath()表示页面申请进入了slowpath,那相对就有fastpath。
从__alloc_pages_nodemask()中可知,这个fastpath就是get_page_from_freelist()。__alloc_pages_slowpath()是fastpath分配失败之后的后备选择。
/*
 * __alloc_pages_slowpath - slow path of the page allocator.
 *
 * @gfp_mask: GFP flags of the request.
 * @order:    requested allocation order.
 * @ac:       allocation context; its zonelist and preferred_zoneref may be
 *            rewritten here.
 *
 * Repeatedly tries get_page_from_freelist(), interleaved with waking kswapd,
 * direct compaction, direct reclaim and finally the OOM path.
 *
 * Returns the allocated page, or NULL.  Every path that reaches "nopage"
 * does so with page == NULL, and warn_alloc() is called there with
 * "page allocation failure: order:%u" before NULL is returned.
 * warn_alloc() is also called from inside the retry loop when the request
 * has been looping for longer than stall_timeout.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
		       struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned long alloc_start = jiffies;
	unsigned int stall_timeout = 10 * HZ;
	unsigned int cpuset_mems_cookie;

	/*
	 * Orders >= MAX_ORDER are rejected outright.  This returns directly,
	 * so warn_alloc() is NOT called for this case; only a one-time WARN
	 * is raised unless the caller passed __GFP_NOWARN.
	 */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * __GFP_ATOMIC combined with __GFP_DIRECT_RECLAIM: warn once and
	 * drop __GFP_ATOMIC from the mask.
	 */
	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	cpuset_mems_cookie = read_mems_allowed_begin();
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	/* No suitable zone was found: go straight to the failure path. */
	if (!ac->preferred_zoneref->zone)
		goto nopage;

	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/*
	 * Costly request (order > PAGE_ALLOC_COSTLY_ORDER) from a caller that
	 * may direct-reclaim and is NOT allowed to ignore watermarks
	 * (!gfp_pfmemalloc_allowed()): try direct COMPACTION first, before
	 * any reclaim is attempted.
	 */
	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
		!gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

		/* Caller asked for no retries. */
		if (gfp_mask & __GFP_NORETRY) {
			/* Compaction was deferred: fail the request now. */
			if (compact_result == COMPACT_DEFERRED)
				goto nopage;

			/* Otherwise keep the initial priority for the later attempt. */
			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	if (gfp_pfmemalloc_allowed(gfp_mask))
		alloc_flags = ALLOC_NO_WATERMARKS;

	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim) {
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
		goto nopage;
	}

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC) {
		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			cond_resched();
			goto retry;
		}
		goto nopage;
	}

	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page)
		goto got_pg;

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_REPEAT
	 */
	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
		goto nopage;

	/*
	 * Make sure we know about allocations which stall for too long.
	 * The first report fires after 10 * HZ; each report pushes the
	 * threshold out by another 10 * HZ.
	 */
	if (time_after(jiffies, alloc_start + stall_timeout)) {
		warn_alloc(gfp_mask,
			"page allocation stalls for %ums, order:%u",
			jiffies_to_msecs(jiffies-alloc_start), order);
		stall_timeout += 10 * HZ;
	}

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/*
	 * Reclaim has failed us, start killing things.
	 * NOTE(review): __alloc_pages_may_oom() is not shown in this listing.
	 * In mainline it is the COSTLY orders (order > PAGE_ALLOC_COSTLY_ORDER)
	 * that skip the OOM killer, not the small ones - confirm against the
	 * kernel version being analysed.
	 */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/* Start over from retry_cpuset if read_mems_allowed_retry() asks for it. */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/* The request for 2^order pages could not be satisfied. */
	warn_alloc(gfp_mask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}
下面两个函数都是vmalloc相关,__vmalloc_area_node()在分配失败之后进入fail,调用warn_alloc()输出log。
/*
 * __vmalloc_area_node - allocate and map the pages backing a vmalloc area.
 *
 * @area:     vm_struct describing the virtual range to populate.
 * @gfp_mask: GFP flags supplied by the caller.
 * @prot:     page protection passed to map_vm_area().
 * @node:     NUMA node to allocate from, or NUMA_NO_NODE.
 *
 * Returns area->addr on success, NULL on failure.
 *
 * Individual pages are allocated with __GFP_NOWARN added (alloc_mask).
 * When a page allocation or the mapping fails, one warn_alloc() is issued
 * from the "fail" label using the caller's original gfp_mask, followed by
 * vfree(area->addr).
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node)
{
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;

	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, area->caller);
	} else {
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	/* The page-pointer array itself could not be allocated: no warn_alloc() here. */
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node == NUMA_NO_NODE)
			page = alloc_page(alloc_mask);
		else
			page = alloc_pages_node(node, alloc_mask, 0);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
		if (gfpflags_allow_blocking(gfp_mask))
			cond_resched();
	}

	if (map_vm_area(area, prot, pages))
		goto fail;
	return area->addr;

fail:
	/* Reports how many bytes were obtained out of the requested area size. */
	warn_alloc(gfp_mask,
			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
			  (area->nr_pages*PAGE_SIZE), area->size);
	vfree(area->addr);
	return NULL;
}
/*
 * __vmalloc_node_range - reserve a virtual range and back it with pages.
 *
 * @size:     requested size in bytes (rounded up with PAGE_ALIGN()).
 * @align:    alignment passed to __get_vm_area_node().
 * @start:    lower bound of the virtual range to search.
 * @end:      upper bound of the virtual range to search.
 * @gfp_mask: GFP flags for the allocation.
 * @prot:     page protection for the mapping.
 * @vm_flags: extra VM_* flags, OR-ed with VM_ALLOC | VM_UNINITIALIZED.
 * @node:     NUMA node to allocate from.
 * @caller:   caller address passed on to __get_vm_area_node().
 *
 * Returns the mapped address, or NULL on failure.
 *
 * warn_alloc() is called from the "fail" label for two cases: the size is
 * zero or exceeds totalram_pages, or no vm area could be obtained.  When
 * __vmalloc_area_node() fails, NULL is returned without a second warning.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, unsigned long vm_flags, int node,
			const void *caller)
{
	struct vm_struct *area;
	void *addr;
	/* Keep the unrounded size for the warning and for kmemleak. */
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
				vm_flags, start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
	if (!addr)
		return NULL;

	clear_vm_uninitialized_flag(area);

	kmemleak_alloc(addr, real_size, 2, gfp_mask);

	return addr;

fail:
	warn_alloc(gfp_mask,
			  "vmalloc: allocation failure: %lu bytes", real_size);
	return NULL;
}
2. warn_alloc()解析
warn_alloc()首先显示相关进程和内存分配gfp_mask信息,然后打印栈信息,最后调用show_mem()显示内存信息。
/*
 * warn_alloc - report a failed or stalled memory allocation.
 *
 * @gfp_mask: GFP flags of the request being reported.
 * @fmt:      printf-style message describing the failure, followed by its
 *            arguments.
 *
 * Prints nothing when the caller passed __GFP_NOWARN, when the nopage_rs
 * rate limit is exceeded, or when debug_guardpage_minorder() > 0.
 * Otherwise emits, in order: the current task name, the caller's message,
 * the gfp_mask, a stack dump and - unless should_suppress_show_mem() says
 * otherwise - the show_mem() memory report.
 */
void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;
	struct va_format vaf;
	va_list args;

	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
	    debug_guardpage_minorder() > 0)
		return;

	/*
	 * SHOW_MEM_FILTER_NODES is dropped from the filter in the two cases
	 * below.  The filter is later consulted by skip_free_areas_node().
	 * NOTE(review): presumably the flag restricts the report to nodes the
	 * task is allowed to allocate from - confirm in skip_free_areas_node().
	 */
	if (!(gfp_mask & __GFP_NOMEMALLOC))
		if (test_thread_flag(TIF_MEMDIE) ||
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
			filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
		filter &= ~SHOW_MEM_FILTER_NODES;

	/* Name of the task that issued the allocation. */
	pr_warn("%s: ", current->comm);

	/* The caller-supplied message, expanded through %pV. */
	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_cont("%pV", &vaf);
	va_end(args);

	/* The GFP mask, both as a hex value and via %pGg. */
	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);

	/* Call stack of the failing allocation. */
	dump_stack();
	/* Detailed memory state; usually the most informative part of the report. */
	if (!should_suppress_show_mem())
		show_mem(filter);
}
show_mem()显示详细的内存信息。
/*
 * show_mem - print a system-wide memory summary.
 *
 * @filter: passed through unchanged to show_free_areas().
 *
 * Prints the "Mem-Info:" header, delegates the detailed breakdown to
 * show_free_areas(), then walks every populated zone of every online node
 * to total up present, reserved (present minus managed) and highmem pages.
 * CMA, quicklist and hwpoison counters are printed only when the matching
 * config option is enabled.
 */
void show_mem(unsigned int filter)
{
	pg_data_t *pgdat;
	unsigned long total = 0, reserved = 0, highmem = 0;

	printk("Mem-Info:\n");
	show_free_areas(filter);

	for_each_online_pgdat(pgdat) {
		unsigned long flags;
		int zoneid;

		pgdat_resize_lock(pgdat, &flags);
		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			struct zone *zone = &pgdat->node_zones[zoneid];

			if (!populated_zone(zone))
				continue;

			total += zone->present_pages;
			reserved += zone->present_pages - zone->managed_pages;

			if (is_highmem_idx(zoneid))
				highmem += zone->present_pages;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}

	/* Platform-wide page totals: all pages, highmem, reserved, CMA, ... */
	printk("%lu pages RAM\n", total);
	printk("%lu pages HighMem/MovableOnly\n", highmem);
	printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
	printk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_QUICKLIST
	printk("%lu pages in pagetable cache\n",
		quicklist_total_size());
#endif
#ifdef CONFIG_MEMORY_FAILURE
	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
}
show_free_areas()从所有node、不同node、不同zone、同一zone下不同order分别显示空闲页面信息。
void show_free_areas(unsigned int filter) { unsigned long free_pcp = 0; int cpu; struct zone *zone; pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu "-----------------显示所有node的统计信息。 " active_file:%lu inactive_file:%lu isolated_file:%lu " " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu " " slab_reclaimable:%lu slab_unreclaimable:%lu " " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu " " free:%lu free_pcp:%lu free_cma:%lu ", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), global_node_page_state(NR_ISOLATED_ANON), global_node_page_state(NR_ACTIVE_FILE), global_node_page_state(NR_INACTIVE_FILE), global_node_page_state(NR_ISOLATED_FILE), global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) {-------------------------------------------------分别显示不同node的统计信息。 printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " isolated(anon):%lukB" " isolated(file):%lukB" " mapped:%lukB" " dirty:%lukB" " writeback:%lukB" " shmem:%lukB" #ifdef CONFIG_TRANSPARENT_HUGEPAGE " shmem_thp: %lukB" " shmem_pmdmapped: %lukB" " anon_thp: %lukB" #endif " writeback_tmp:%lukB" " unstable:%lukB" " pages_scanned:%lu" " all_unreclaimable? 
%s" " ", pgdat->node_id, K(node_page_state(pgdat, NR_ACTIVE_ANON)), K(node_page_state(pgdat, NR_INACTIVE_ANON)), K(node_page_state(pgdat, NR_ACTIVE_FILE)), K(node_page_state(pgdat, NR_INACTIVE_FILE)), K(node_page_state(pgdat, NR_UNEVICTABLE)), K(node_page_state(pgdat, NR_ISOLATED_ANON)), K(node_page_state(pgdat, NR_ISOLATED_FILE)), K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), !pgdat_reclaimable(pgdat) ? "yes" : "no"); } for_each_populated_zone(zone) {----------------------------------------------分别显示所有zone的统计信息。 int i; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; free_pcp = 0; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); printk(KERN_CONT "%s" " free:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" " ", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, 
NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); printk(KERN_CONT " "); } for_each_populated_zone(zone) {-------------------------------------------显示所有zone下不同order空闲数目统计信息。 unsigned int order; unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) {-------------------------遍历当前zone的不同order,不同order区域数目存在nr[]中,total是总的页面数目。 struct free_area *area = &zone->free_area[order]; int type; nr[order] = area->nr_free; total += nr[order] << order; types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { if (!list_empty(&area->free_list[type])) types[order] |= 1 << type;--------------------------------记录order区域中页面类型。 } } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order);-------------------------------输出不同order区域数量和区域大小。 if (nr[order]) show_migration_types(types[order]);---------------------------输出页面类型。 } printk(KERN_CONT "= %lukB ", K(total));------------------------------显示总大小。 } hugetlb_show_meminfo();---------------------------------------------------显示huge page统计信息。 printk("%ld total pagecache pages ", global_node_page_state(NR_FILE_PAGES));---总的文件缓存页面数量。 show_swap_cache_info();----------------------------------------------------显示swap cache统计信息。 }
不同的页面有不同的属性,在warn_alloc()输出的字母对应了页面的属性。主要有M、U、E、C。
static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U',--------------------------不可移动。 [MIGRATE_MOVABLE] = 'M',----------------------------可移动。 [MIGRATE_RECLAIMABLE] = 'E',------------------------可回收。 [MIGRATE_HIGHATOMIC] = 'H',-------------------------等同于MIGRATE_PCPTYPES。 #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C',----------------------------CMA区域页面。 #endif #ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', #endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; int i; for (i = 0; i < MIGRATE_TYPES; i++) { if (type & (1 << i)) *p++ = types[i]; } *p = '