zoukankan      html  css  js  c++  java
  • Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

    在我们使用ARM等嵌入式Linux系统的时候,一个头疼的问题是GPU,Camera,HDMI等都需要预留大量连续内存,这部分内存平时不用,但是一般的做法又必须先预留着。目前,Marek Szyprowski和Michal Nazarewicz实现了一套全新的Contiguous Memory Allocator。通过这套机制,我们可以做到不预留内存,这些内存平时是可用的,只有当需要的时候才被分配给Camera,HDMI等设备。下面分析它的基本代码流程。

    1. 声明连续内存

    内核启动过程中arch/arm/mm/init.c中的arm_memblock_init()会调用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

    该函数位于:drivers/base/dma-contiguous.c

    /**
     * dma_contiguous_reserve() - reserve area for contiguous memory handling
     * @limit: End address of the reserved memory (optional, 0 for any).
     *
     * This function reserves memory from early allocator. It should be
     * called by arch specific code once the early allocator (memblock or bootmem)
     * has been activated and all other subsystems have already allocated/reserved
     * memory.
     *
     * The size of the global area is size_cmdline when that is not -1;
     * otherwise it is chosen by whichever CONFIG_CMA_SIZE_SEL_* option is
     * defined. Nothing is reserved when the resulting size is zero.
     */
    void __init dma_contiguous_reserve(phys_addr_t limit)
    {
            unsigned long selected_size = 0;

            pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);

            if (size_cmdline != -1) {
                    selected_size = size_cmdline;
            } else {
    #ifdef CONFIG_CMA_SIZE_SEL_MBYTES
                    selected_size = size_bytes;
    #elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
                    selected_size = cma_early_percent_memory();
    #elif defined(CONFIG_CMA_SIZE_SEL_MIN)
                    selected_size = min(size_bytes, cma_early_percent_memory());
    #elif defined(CONFIG_CMA_SIZE_SEL_MAX)
                    selected_size = max(size_bytes, cma_early_percent_memory());
    #endif
            }

            if (selected_size) {
                    pr_debug("%s: reserving %ld MiB for global area\n", __func__,
                             selected_size / SZ_1M);

                    /* NULL device: this is the global (default) area. */
                    dma_declare_contiguous(NULL, selected_size, 0, limit);
            }
    }
    

    其中的size_bytes定义为:

    /* Size of the global CMA area in bytes: CMA_SIZE_MBYTES times SZ_1M. */
    static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
    

    默认情况下,CMA_SIZE_MBYTES会被定义为16(即16MB),来源于CONFIG_CMA_SIZE_MBYTES=16

    /*
     * Reserve @size bytes of contiguous memory for @dev (excerpt; "..." marks
     * code omitted here).
     *
     * With a non-zero @base, exactly that region is reserved through
     * memblock_reserve(); -EBUSY results if it is already reserved or the
     * reservation fails. With @base == 0, the region is allocated with
     * __memblock_alloc_base() below @limit; -ENOMEM results if that fails and
     * -EINVAL if the end of the region does not fit in an unsigned long.
     *
     * Returns 0 on success, or the negative error code stored in base.
     */
    int __init dma_declare_contiguous(struct device *dev, unsigned long size,
                                      phys_addr_t base, phys_addr_t limit)
    {
            ...
            /* Reserve memory */
            if (base) {
                    if (memblock_is_region_reserved(base, size) ||
                        memblock_reserve(base, size) < 0) {
                            base = -EBUSY;
                            goto err;
                    }
            } else {
                    /*
                     * Use __memblock_alloc_base() since
                     * memblock_alloc_base() panic()s.
                     */
                    phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
                    if (!addr) {
                            base = -ENOMEM;
                            goto err;
                    } else if (addr + size > ~(unsigned long)0) {
                            /* Give the block back before reporting the error. */
                            memblock_free(addr, size);
                            base = -EINVAL;
                            goto err;
                    } else {
                            base = addr;
                    }
            }

            /*
             * Each reserved area must be initialised later, when more kernel
             * subsystems (like slab allocator) are available.
             */
            r->start = base;
            r->size = size;
            r->dev = dev;
            cma_reserved_count++;
            pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
                    (unsigned long)base);

            /* Architecture specific contiguous memory fixup. */
            dma_contiguous_early_fixup(base, size);
            return 0;
    err:
            pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
            return base;
    }
    

    由此可见,连续内存区域也是在内核启动的早期,通过__memblock_alloc_base()拿到的。

    另外:

    drivers/base/dma-contiguous.c里面的core_initcall()会导致cma_init_reserved_areas()被调用:

    cma_create_area()会调用cma_activate_area(),cma_activate_area()函数则会针对保留区域中的每个pageblock调用:

    init_cma_reserved_pageblock(pfn_to_page(base_pfn));

    这个函数则会通过set_pageblock_migratetype(page, MIGRATE_CMA)将页设置为MIGRATE_CMA类型的:

    #ifdef CONFIG_CMA
    /*
     * Release one whole pageblock, starting at @page, to the page allocator
     * with its migration type set to MIGRATE_CMA.
     */
    void __init init_cma_reserved_pageblock(struct page *page)
    {
            struct page *cur = page;
            unsigned remaining = pageblock_nr_pages;

            /* Clear the reserved flag and zero the count of every page. */
            do {
                    __ClearPageReserved(cur);
                    set_page_count(cur, 0);
                    cur++;
                    remaining--;
            } while (remaining);

            set_page_refcounted(page);
            set_pageblock_migratetype(page, MIGRATE_CMA);
            __free_pages(page, pageblock_order);
            totalram_pages += pageblock_nr_pages;
    }
    #endif
    

    同时其中调用的__free_pages(page, pageblock_order);最终会调用到__free_one_page(page, zone, order, migratetype);

    相关的page会被加到MIGRATE_CMA的free_list上面去:

    list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

    2. 申请连续内存

    申请连续内存仍然使用标准的arch/arm/mm/dma-mapping.c中定义的dma_alloc_coherent()和dma_alloc_writecombine(),这二者会间接调用drivers/base/dma-contiguous.c中的

    struct page *dma_alloc_from_contiguous(struct device *dev, int count,
                                           unsigned int align)
    

    ->

    /*
     * Excerpt of dma_alloc_from_contiguous() ("..." marks omitted code).
     *
     * The visible loop looks for a free run of @count bits in cma->bitmap and
     * tries to claim the matching PFN range with alloc_contig_range(). On
     * -EBUSY it retries from a later bitmap position; any other error, or
     * running out of bitmap space (-ENOMEM), jumps to the error label.
     */
    struct page *dma_alloc_from_contiguous(struct device *dev, int count,
                                           unsigned int align)
    {
           ...

           for (;;) {
                    pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
                                                        start, count, mask);
                    if (pageno >= cma->count) {
                            ret = -ENOMEM;
                            goto error;
                    }

                    pfn = cma->base_pfn + pageno;
                    ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
                    if (ret == 0) {
                            /* Range obtained: mark it used in the bitmap. */
                            bitmap_set(cma->bitmap, pageno, count);
                            break;
                    } else if (ret != -EBUSY) {
                            goto error;
                    }
                    pr_debug("%s(): memory range at %p is busy, retrying\n",
                             __func__, pfn_to_page(pfn));
                    /* try again with a bit different memory target */
                    start = pageno + mask + 1;
            }
           ...

    }
    

    ->

    int alloc_contig_range(unsigned long start, unsigned long end,
    
                           unsigned migratetype)
    

    需要隔离page,隔离page的作用通过代码的注释可以体现:

     /*
             * What we do here is we mark all pageblocks in range as
             * MIGRATE_ISOLATE.  Because of the way page allocator work, we
             * align the range to MAX_ORDER pages so that page allocator
             * won't try to merge buddies from different pageblocks and
             * change MIGRATE_ISOLATE to some other migration type.
             *
             * Once the pageblocks are marked as MIGRATE_ISOLATE, we
             * migrate the pages from an unaligned range (ie. pages that
             * we are interested in).  This will put all the pages in
             * range back to page allocator as MIGRATE_ISOLATE.
             *
             * When this is done, we take the pages in range from page
             * allocator removing them from the buddy system.  This way
             * page allocator will never consider using them.
             *
             * This lets us mark the pageblocks back as
             * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
             * MAX_ORDER aligned range but not in the unaligned, original
             * range are put back to page allocator so that buddy can use
             * them. 
             */  
                    
            ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),
                                           pfn_align_to_maxpage_up(end),
                                           migratetype);
    

    简单地说,就是把相关的page标记为MIGRATE_ISOLATE,这样buddy系统就不会再使用他们。

    /*
     * start_isolate_page_range() -- switch every pageblock in a PFN range to
     * MIGRATE_ISOLATE.
     * @start_pfn: The lower PFN of the range to be isolated.
     * @end_pfn: The upper PFN of the range to be isolated.
     * @migratetype: migrate type restored on the pageblocks already isolated
     *               when a later one fails.
     *
     * Once a pageblock is MIGRATE_ISOLATE, free pages in it (and pages freed
     * into it later) will not be handed out by the allocator.
     *
     * start_pfn/end_pfn must be aligned to pageblock_order.
     * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
     */
    int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
                                 unsigned migratetype)
    {
            unsigned long cur;
            unsigned long done;
            struct page *first;

            BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
            BUG_ON((end_pfn) & (pageblock_nr_pages - 1));

            for (cur = start_pfn; cur < end_pfn; cur += pageblock_nr_pages) {
                    first = __first_valid_page(cur, pageblock_nr_pages);
                    if (!first || !set_migratetype_isolate(first))
                            continue;

                    /* This block failed: roll back the ones before it. */
                    for (done = start_pfn; done < cur; done += pageblock_nr_pages)
                            unset_migratetype_isolate(pfn_to_page(done),
                                                      migratetype);
                    return -EBUSY;
            }

            return 0;
    }
    

    接下来调用__alloc_contig_migrate_range()进行页面隔离和迁移:

    /*
     * Walk the PFN range [start, end): collect pages onto cc.migratepages with
     * isolate_migratepages_range() and pass that list to migrate_pages(),
     * repeating until the range is consumed and the list is empty.
     *
     * Returns 0 when the last result is zero or positive, otherwise a negative
     * error: -EINTR on a pending fatal signal or when isolation returns 0,
     * and -EBUSY (or the earlier negative result) when the same list is still
     * non-empty on the fifth consecutive attempt.
     */
    static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 
    {
            /* This function is based on compact_zone() from compaction.c. */
     
            unsigned long pfn = start;
            unsigned int tries = 0; 
            int ret = 0; 
     
            struct compact_control cc = {
                    .nr_migratepages = 0, 
                    .order = -1,
                    .zone = page_zone(pfn_to_page(start)),
                    .sync = true,
            };   
            INIT_LIST_HEAD(&cc.migratepages);
     
            migrate_prep_local();
     
            while (pfn < end || !list_empty(&cc.migratepages)) {
                    if (fatal_signal_pending(current)) {
                            ret = -EINTR;
                            break;
                    }    
     
                    if (list_empty(&cc.migratepages)) {
                            /* Nothing pending: pull the next batch and restart the retry count. */
                            cc.nr_migratepages = 0; 
                            pfn = isolate_migratepages_range(cc.zone, &cc, 
                                                             pfn, end);
                            if (!pfn) {
                                    ret = -EINTR;
                                    break;
                            }    
                            tries = 0; 
                    } else if (++tries == 5) { 
                            /* Same batch still pending after repeated attempts: give up. */
                            ret = ret < 0 ? ret : -EBUSY;
                            break;
                    }    
     
                    ret = migrate_pages(&cc.migratepages,
                                        __alloc_contig_migrate_alloc,
                                        0, false, true);
            }    
     
            /* Whatever is still on the list is handed to putback_lru_pages(). */
            putback_lru_pages(&cc.migratepages);
            /* A positive ret is collapsed to 0; zero and negative values pass through. */
            return ret > 0 ? 0 : ret; 
    }
    
    

    其中的函数migrate_pages()会完成页面的迁移,迁移过程中通过传入的__alloc_contig_migrate_alloc()申请新的page,并将老的page的内容赋给新的page:

    /*
     * Excerpt of migrate_pages() ("..." marks omitted code, including the
     * "out" label and the return).
     *
     * Each page on @from is passed to unmap_and_move(), with @get_new_page and
     * @private forwarded to it. The list is walked for at most 10 passes, and
     * another pass is made only while the previous one saw -EAGAIN.
     */
    int migrate_pages(struct list_head *from,
                    new_page_t get_new_page, unsigned long private, bool offlining,
                    bool sync)
    {
            int retry = 1; 
            int nr_failed = 0; 
            int pass = 0; 
            struct page *page;
            struct page *page2;
            /* Remember whether PF_SWAPWRITE was already set before forcing it on. */
            int swapwrite = current->flags & PF_SWAPWRITE;
            int rc;
     
            if (!swapwrite)
                    current->flags |= PF_SWAPWRITE;
     
            for(pass = 0; pass < 10 && retry; pass++) {
                    retry = 0; 
     
                    list_for_each_entry_safe(page, page2, from, lru) {
                            cond_resched();
     
                            /* The force argument becomes true from the fourth pass (pass > 2). */
                            rc = unmap_and_move(get_new_page, private,
                                                    page, pass > 2, offlining,
                                                    sync);
     
                            switch(rc) {
                            case -ENOMEM:
                                    /* Abort the whole walk. */
                                    goto out; 
                            case -EAGAIN:
                                    /* Counted so that another pass is made. */
                                    retry++;
                                    break;
                            case 0:
                                    break;
                            default:
                                    /* Permanent failure */
                                    nr_failed++;
                                    break;
                            }    
                    }    
            }    
            rc = 0;
    ...
    } 
    

    其中的unmap_and_move()函数较为关键,它定义在mm/migrate.c中

    /*
     * Obtain the lock on page, remove all ptes and migrate the page
     * to the newly allocated page in newpage.
     *
     * Excerpt: "..." marks omitted code. The jumps to the skip_unmap, unlock
     * and move_newpage labels are in the omitted parts.
     */
    static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                struct page *page, int force, bool offlining, bool sync)
    {
        int rc = 0;
        int *result = NULL;
        /* The destination page comes from the caller-supplied allocator callback. */
        struct page *newpage = get_new_page(page, private, &result);
        int remap_swapcache = 1;
        int charge = 0;
        struct mem_cgroup *mem = NULL;
        struct anon_vma *anon_vma = NULL;
     
        ...
     
        /* charge against new page */
        charge = mem_cgroup_prepare_migration(page, newpage, &mem);
        ...
     
        /*
         * Page under writeback: bail out unless both force and sync are set,
         * in which case wait for the writeback to finish.
         */
        if (PageWriteback(page)) {
            if (!force || !sync)
                goto uncharge;
            wait_on_page_writeback(page);
        }
        /*
         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrates a page.
         * This get_anon_vma() delays freeing anon_vma pointer until the end
         * of migration. File cache pages are no problem because of page_lock()
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
         */
        if (PageAnon(page)) {
            /*
             * Only page_lock_anon_vma() understands the subtleties of
             * getting a hold on an anon_vma from outside one of its mms.
             */
            anon_vma = page_lock_anon_vma(page);
            if (anon_vma) {
                /*
                 * Take a reference count on the anon_vma if the
                 * page is mapped so that it is guaranteed to
                 * exist when the page is remapped later
                 */
                get_anon_vma(anon_vma);
                page_unlock_anon_vma(anon_vma);
            } else if (PageSwapCache(page)) {
                /*
                 * We cannot be sure that the anon_vma of an unmapped
                 * swapcache page is safe to use because we don't
                 * know in advance if the VMA that this page belonged
                 * to still exists. If the VMA and others sharing the
                 * data have been freed, then the anon_vma could
                 * already be invalid.
                 *
                 * To avoid this possibility, swapcache pages get
                 * migrated but are not remapped when migration
                 * completes
                 */
                remap_swapcache = 0;
            } else {
                goto uncharge;
            }
        }
     
        ...
        /* Establish migration ptes or remove ptes */
        try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
     
    skip_unmap:
        /* Move only when page_mapped() reports no remaining mappings. */
        if (!page_mapped(page))
            rc = move_to_new_page(newpage, page, remap_swapcache);
     
        /*
         * rc != 0 here: call remove_migration_ptes() on the old page, unless
         * remapping was switched off for the swapcache case above.
         */
        if (rc && remap_swapcache)
            remove_migration_ptes(page, page);
     
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
            drop_anon_vma(anon_vma);
     
    uncharge:
        /* Runs when charge is 0; the last argument is rc == 0. */
        if (!charge)
            mem_cgroup_end_migration(mem, page, newpage, rc == 0);
    unlock:
        unlock_page(page);
     
    move_newpage:
        ...
    }
    
    

    通过unmap_and_move(),老的page就被迁移过去新的page。

    接下来要回收page,回收page的作用是,不至于因为拿了连续的内存后,系统变得内存饥饿:

    ->

    /*
             * Reclaim enough pages to make sure that contiguous allocation
             * will not starve the system.
             */
            __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
    

    ->

    /*
     * Apply memory pressure until @zone passes a watermark check at its low
     * watermark plus @count pages, so that @count single pages could be
     * allocated afterwards. Does similar work as the
     * __alloc_pages_slowpath() function.
     *
     * Always returns @count.
     */
    static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
    {
            enum zone_type high_zoneidx = gfp_zone(gfp_mask);
            struct zonelist *zonelist = node_zonelist(0, gfp_mask);
            unsigned long target;
            int progress = 0;
            int order = 1;

            /*
             * Raise the watermarks by @count so kswapd works towards the
             * higher level.
             */
            __update_cma_watermarks(zone, count);

            /* Obey watermarks as if the pages were being allocated. */
            target = low_wmark_pages(zone) + count;
            for (;;) {
                    if (zone_watermark_ok(zone, 0, target, 0, 0))
                            break;

                    wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));

                    progress = __perform_reclaim(gfp_mask, order, zonelist,
                                                 NULL);
                    if (progress)
                            continue;

                    /* Reclaim made no progress: fall back to the OOM path. */
                    out_of_memory(zonelist, gfp_mask, order, NULL);
            }

            /* Put the watermarks back where they were. */
            __update_cma_watermarks(zone, -count);

            return count;
    }
    

    3. 释放连续内存

    内存释放的时候也比较简单,直接就是:

    arch/arm/mm/dma-mapping.c

    void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
    

    ->

    arch/arm/mm/dma-mapping.c:

    /*
     * Remap @size bytes starting at @page with pgprot_kernel, then release
     * the corresponding number of pages through dma_release_from_contiguous().
     */
    static void __free_from_contiguous(struct device *dev, struct page *page,
                                       size_t size)
    {
            size_t nr_pages = size >> PAGE_SHIFT;

            __dma_remap(page, size, pgprot_kernel);
            dma_release_from_contiguous(dev, page, nr_pages);
    }
    

    ->

    /*
     * Excerpt of dma_release_from_contiguous(): only the call that frees
     * @count pages starting at pfn via free_contig_range() is shown; the
     * rest of the body, including how pfn is derived and what is returned,
     * is omitted.
     */
    bool dma_release_from_contiguous(struct device *dev, struct page *pages,
                                     int count)
    {
            ...
            free_contig_range(pfn, count);
            ..
     
    }
    

    ->

    /* Free @nr_pages consecutive pages, one at a time, starting at @pfn. */
    void free_contig_range(unsigned long pfn, unsigned nr_pages)
    {
            while (nr_pages--) {
                    __free_page(pfn_to_page(pfn));
                    ++pfn;
            }
    }
    

    将page交还给buddy。

    4. 内核内存分配的migratetype

    内核内存分配的时候,带的标志是GFP_,但是GFP_可以转化为migratetype:

    /*
     * Map GFP allocation flags to a migrate type.
     *
     * Returns MIGRATE_UNMOVABLE when grouping by mobility is disabled.
     * Otherwise returns a two-bit value: bit 1 is set when __GFP_MOVABLE is
     * present and bit 0 when __GFP_RECLAIMABLE is present. Warns if both
     * bits of GFP_MOVABLE_MASK are set together.
     */
    static inline int allocflags_to_migratetype(gfp_t gfp_flags)
    {
            int movable_bit;
            int reclaimable_bit;

            WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

            if (unlikely(page_group_by_mobility_disabled))
                    return MIGRATE_UNMOVABLE;

            /* Group based on mobility */
            movable_bit = (gfp_flags & __GFP_MOVABLE) ? 1 : 0;
            reclaimable_bit = (gfp_flags & __GFP_RECLAIMABLE) ? 1 : 0;

            return (movable_bit << 1) | reclaimable_bit;
    }
    

    之后申请内存的时候,会对比迁移类型匹配的free_list:

            page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                            zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
                            preferred_zone, migratetype);
    

    另外,笔者也编写了一个测试程序,透过它随时测试CMA的功能:

    /*
     * kernel module helper for testing CMA
     *
     * Licensed under GPLv2 or later.
     */
     
    #include <linux/module.h>
    #include <linux/device.h>
    #include <linux/fs.h>
    #include <linux/miscdevice.h>
    #include <linux/dma-mapping.h>
     
    /* Number of allocation slots; slot i is used for an (i + 1) * SZ_1M byte buffer. */
    #define CMA_NUM  10
    /* Device passed to the DMA calls; assigned in cma_test_init(). */
    static struct device *cma_dev;
    /* DMA handle of each slot's buffer. */
    static dma_addr_t dma_phys[CMA_NUM];
    /* CPU address of each slot's buffer; NULL means the slot is free. */
    static void *dma_virt[CMA_NUM];
     
    /*
     * any read request will free coherent memory, eg.
     * cat /dev/cma_test
     *
     * Each read frees only the first slot that is currently allocated, then
     * stops. Always returns 0, so the reader sees end-of-file.
     */
    static ssize_t
    cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
    {
    	int i;

    	for (i = 0; i < CMA_NUM; i++) {
    		if (dma_virt[i]) {
    			dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
    			_dev_info(cma_dev, "free virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
    			/* Mark the slot free so a later write can reuse it. */
    			dma_virt[i] = NULL;
    			break;
    		}
    	}
    	return 0;
    }
     
    /*
     * any write request will alloc coherent memory, eg.
     * echo 0 > /dev/cma_test
     *
     * Each write fills the first free slot i with an (i + 1) * SZ_1M byte
     * coherent buffer and touches every page of it. The written data itself
     * is ignored.
     *
     * Returns @count on success (also when every slot is already in use), or
     * -ENOMEM when the allocation fails.
     */
    static ssize_t
    cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
    {
    	int i;

    	for (i = 0; i < CMA_NUM; i++) {
    		void *p;

    		if (dma_virt[i])
    			continue;

    		dma_virt[i] = dma_alloc_coherent(cma_dev, (i + 1) * SZ_1M, &dma_phys[i], GFP_KERNEL);
    		if (!dma_virt[i]) {
    			dev_err(cma_dev, "no mem in CMA area\n");
    			/* Report the failure to the writer instead of claiming success. */
    			return -ENOMEM;
    		}

    		/* touch every page in the allocated memory */
    		for (p = dma_virt[i]; p <  dma_virt[i] + (i + 1) * SZ_1M; p += PAGE_SIZE)
    			*(u32 *)p = 0;

    		_dev_info(cma_dev, "alloc virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
    		break;
    	}

    	return count;
    }
     
    /* File operations of the test device: read frees a buffer, write allocates one. */
    static const struct file_operations cma_test_fops = {
    	.owner =    THIS_MODULE,
    	.read  =    cma_test_read,
    	.write =    cma_test_write,
    };
     
    static struct miscdevice cma_test_misc = {
    	.name = "cma_test",
    	.fops = &cma_test_fops,
    };
     
    /*
     * Module init: register the misc device and keep its struct device in
     * cma_dev for the DMA calls, with the coherent DMA mask set to all ones.
     *
     * Returns 0 on success or the error returned by misc_register().
     */
    static int __init cma_test_init(void)
    {
    	int ret = 0;

    	ret = misc_register(&cma_test_misc);
    	if (unlikely(ret)) {
    		pr_err("failed to register cma test misc device!\n");
    		return ret;
    	}
    	cma_dev = cma_test_misc.this_device;
    	cma_dev->coherent_dma_mask = ~0;
    	_dev_info(cma_dev, "registered.\n");

    	return ret;
    }
    module_init(cma_test_init);
     
    /*
     * Module exit: free every buffer that is still allocated, then
     * deregister the misc device. The buffers are freed first because
     * cma_dev points at the misc device's struct device.
     */
    static void __exit cma_test_exit(void)
    {
    	int i;

    	/* Without this, buffers left allocated would stay allocated after unload. */
    	for (i = 0; i < CMA_NUM; i++) {
    		if (dma_virt[i]) {
    			dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
    			dma_virt[i] = NULL;
    		}
    	}

    	misc_deregister(&cma_test_misc);
    }
    module_exit(cma_test_exit);
     
    MODULE_LICENSE("GPL");
    MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");
    MODULE_DESCRIPTION("kernel module to help the test of CMA");
    MODULE_ALIAS("CMA test");
    

    申请内存:

    # echo 0 > /dev/cma_test
    

    释放内存:

    # cat /dev/cma_test
    
  • 相关阅读:
    java_监控工具jvisualvm
    bzoj3667: Rabin-Miller算法
    bzoj3677: [Apio2014]连珠线
    4070: [Apio2015]雅加达的摩天楼
    4069: [Apio2015]巴厘岛的雕塑
    4071: [Apio2015]巴邻旁之桥
    bzoj2653: middle
    1500: [NOI2005]维修数列
    bzoj4262: Sum
    bzoj4540: [Hnoi2016]序列
  • 原文地址:https://www.cnblogs.com/linhaostudy/p/10174653.html
Copyright © 2011-2022 走看看