用户空间缺页异常handle_pte_fault()分析--(下)--写时复制【转】

zoukankan html css js c++ java

用户空间缺页异常handle_pte_fault()分析--(下)--写时复制【转】
转自：http://blog.csdn.net/vanbreaker/article/details/7955713

版权声明：本文为博主原创文章，未经博主允许不得转载。
       在handle_pte_fault()中，如果触发异常的页存在于主存中，那么该异常往往是由写了一个只读页触发的，此时需要进行COW(写时复制操作)。如当一个父进程通过fork()创建了一个子进程时，子进程将会共享父进程的页框。之后，无论是父进程还是子进程要对相应的内存进行写操作，都要进行COW，也就是为自己重新分配一个页框，把原页框中的数据复制到新页框中，然后再执行写操作。

[cpp] view plain copy

/*
 * handle_pte_fault() - excerpt showing only the path where the faulting
 * page is already present in memory (earlier handling is elided as "...").
 *
 * The pte value is snapshotted into 'entry' before the page-table lock is
 * taken and re-checked with pte_same() once the lock is held.  A write
 * fault on a non-writable pte is handed to do_wp_page(), whose return
 * value is returned directly; that call is made with ptl still held.
 * Every other path updates the accessed/dirty state of the pte, drops the
 * lock and returns 0.
 */
static inline int handle_pte_fault(struct mm_struct *mm,

        struct vm_area_struct *vma, unsigned long address,

        pte_t *pte, pmd_t *pmd, unsigned int flags)

{

    pte_t entry;

    spinlock_t *ptl;



    entry = *pte;



    ...

    ...

    ...

    /******** Case: the page is present in main memory ***********/



    ptl = pte_lockptr(mm, pmd);

    spin_lock(ptl);

    /* The pte changed since it was snapshotted into 'entry': nothing to do here. */
    if (unlikely(!pte_same(*pte, entry)))

        goto unlock;

    if (flags & FAULT_FLAG_WRITE) {// the fault was triggered by a write access

        if (!pte_write(entry))// ...and the pte does not permit writing

            return do_wp_page(mm, vma, address, // hand over to the write-protect (copy-on-write) handler

                    pte, pmd, ptl, entry);

        entry = pte_mkdirty(entry);

    }

    entry = pte_mkyoung(entry);

    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {

        update_mmu_cache(vma, address, entry);

    } else {

        /*

         * This is needed only for protection faults but the arch code

         * is not yet telling us if this is a protection fault or not.

         * This still avoids useless tlb flushes for .text page faults

         * with threads.

         */

        if (flags & FAULT_FLAG_WRITE)

            flush_tlb_page(vma, address);

    }

unlock:

    pte_unmap_unlock(pte, ptl);

    return 0;

}

可以看到，handle_pte_fault()函数处理页存在于主存中的情况的关键操作都集中在do_wp_page()函数上。该函数是用来处理COW的，不过在COW之前先要做一些检查，比如说，如果对应的页只有一个进程使用，那么便可以直接修改页的权限为可读可写，而不进行COW。总之，不到不得已的情况下是不会进行COW的。

[cpp] view plain copy

/*
 * do_wp_page() - handle a write fault on a pte that is present but not
 * writable.
 *
 * The caller passes in the mapped pte (page_table), the lock protecting it
 * (ptl) and the pte value it observed (orig_pte).  The function either
 *   - makes the existing page writable in place (the "reuse" path), or
 *   - allocates a new page, fills it (zeroed, or copied from the old page)
 *     and switches the pte over to it (the "gotten" path).
 * Whenever the pte lock has been dropped and re-taken, the pte is compared
 * against orig_pte again and the function backs out if it has changed.
 *
 * Returns 'ret' (with VM_FAULT_WRITE or'ed in when the pte was made
 * writable or replaced), VM_FAULT_OOM when preparing or allocating the new
 * page fails, or the error/NOPAGE value reported by ->page_mkwrite().
 * The pte lock is not held on return.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,

        unsigned long address, pte_t *page_table, pmd_t *pmd,

        spinlock_t *ptl, pte_t orig_pte)

{

    struct page *old_page, *new_page;

    pte_t entry;

    int reuse = 0, ret = 0;

    int page_mkwrite = 0;

    struct page *dirty_page = NULL;



    old_page = vm_normal_page(vma, address, orig_pte);// look up the page currently mapped by orig_pte

    if (!old_page) {// no struct page was returned for this pte

        /*

         * VM_MIXEDMAP !pfn_valid() case

         *

         * We should not cow pages in a shared writeable mapping.

         * Just mark the pages writable as we can't do any dirty

         * accounting on raw pfn maps.

         */

         /* If the vma is both shared and writable, jump to reuse and keep using the frame orig_pte points at. */

        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==

                     (VM_WRITE|VM_SHARED))

            goto reuse;

        /* Otherwise jump to gotten and allocate a new page. */

        goto gotten;

    }



    /*

     * Take out anonymous pages first, anonymous shared vmas are

     * not dirty accountable.

     */

     /* Anonymous (non-KSM) pages are handled first: if reuse_swap_page()

        below reports that old_page can be reused, it is used in place. */

    if (PageAnon(old_page) && !PageKsm(old_page)) {

        /* If the page lock cannot be taken immediately: pin the page, drop the pte lock,
           wait for the page lock, re-take the pte lock and verify the pte is unchanged. */

        if (!trylock_page(old_page)) {

            page_cache_get(old_page);

            pte_unmap_unlock(page_table, ptl);

            lock_page(old_page);

            page_table = pte_offset_map_lock(mm, pmd, address,

                             &ptl);

            if (!pte_same(*page_table, orig_pte)) {

                unlock_page(old_page);

                page_cache_release(old_page);

                goto unlock;

            }

            page_cache_release(old_page);

        }

        /* The page is locked and the pte is unchanged; let reuse_swap_page() decide on reuse.

           NOTE(review): presumably this checks that no one else maps the page (_mapcount) - helper not shown here, confirm. */

        reuse = reuse_swap_page(old_page);

        unlock_page(old_page);

    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==

                    (VM_WRITE|VM_SHARED))) {// the vma is a shared, writable mapping

        /*

         * Only catch write-faults on shared writable pages,

         * read-only shared pages can get COWed by

         * get_user_pages(.write=1, .force=1).

         */

        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {

            struct vm_fault vmf;

            int tmp;



            vmf.virtual_address = (void __user *)(address &

                                PAGE_MASK);

            vmf.pgoff = old_page->index;

            vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

            vmf.page = old_page;



            /*

             * Notify the address space that the page is about to

             * become writable so that it can prohibit this or wait

             * for the page to get into an appropriate state.

             *

             * We do this without the lock held, so that it can

             * sleep if it needs to.

             */

            page_cache_get(old_page);// take a reference on old_page before the pte lock is dropped

            pte_unmap_unlock(page_table, ptl);



            /* Tell the mapping's owner that the page is about to become writable. */

            tmp = vma->vm_ops->page_mkwrite(vma, &vmf);



            /* On an error or NOPAGE result, bail out through unwritable_page. */

            if (unlikely(tmp &

                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {

                ret = tmp;

                goto unwritable_page;

            }

            if (unlikely(!(tmp & VM_FAULT_LOCKED))) {

                lock_page(old_page);

                if (!old_page->mapping) {

                    ret = 0; /* retry the fault */

                    unlock_page(old_page);

                    goto unwritable_page;

                }

            } else

                VM_BUG_ON(!PageLocked(old_page));



            /*

             * Since we dropped the lock we need to revalidate

             * the PTE as someone else may have changed it.  If

             * they did, we just return, as we can count on the

             * MMU to tell us if they didn't also make it writable.

             */

             /* ->page_mkwrite() succeeded and the page is locked; re-take the pte lock and check the pte is unchanged. */

            page_table = pte_offset_map_lock(mm, pmd, address,

                             &ptl);

            if (!pte_same(*page_table, orig_pte)) {

                unlock_page(old_page);

                page_cache_release(old_page);

                goto unlock;

            }



            page_mkwrite = 1;

        }

        dirty_page = old_page;

        get_page(dirty_page);

        reuse = 1;

    }



    if (reuse) {// reuse path: no copy is made, the write goes to old_page itself

reuse:

        flush_cache_page(vma, address, pte_pfn(orig_pte));

        entry = pte_mkyoung(orig_pte);// mark the pte as accessed (young)

        entry = maybe_mkwrite(pte_mkdirty(entry), vma);// mark the pte dirty; maybe_mkwrite() presumably adds write permission when the vma allows it

        if (ptep_set_access_flags(vma, address, page_table, entry,1))

            update_mmu_cache(vma, address, entry);

        ret |= VM_FAULT_WRITE;

        goto unlock;

    }



    /*

     * Ok, we need to copy. Oh, well..

     */

     /*************** Last resort: from here on an actual copy-on-write is performed ********************/

    page_cache_get(old_page);

gotten:

    pte_unmap_unlock(page_table, ptl);



    if (unlikely(anon_vma_prepare(vma)))

        goto oom;



    if (is_zero_pfn(pte_pfn(orig_pte))) {

        new_page = alloc_zeroed_user_highpage_movable(vma, address);// orig_pte maps the zero pfn: allocate a zeroed page, nothing to copy

        if (!new_page)

            goto oom;

    } else {

        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);// allocate a page that will receive a copy

        if (!new_page)

            goto oom;

        cow_user_page(new_page, old_page, address, vma);// copy the data of old_page into new_page

    }

    __SetPageUptodate(new_page);



    /*

     * Don't let another task, with possibly unlocked vma,

     * keep the mlocked page.

     */

    if ((vma->vm_flags & VM_LOCKED) && old_page) {

        lock_page(old_page);    /* for LRU manipulation */

        clear_page_mlock(old_page);

        unlock_page(old_page);

    }



    if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))

        goto oom_free_new;



    /*

     * Re-check the pte - we dropped the lock

     */

    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

    if (likely(pte_same(*page_table, orig_pte))) {

        if (old_page) {

            if (!PageAnon(old_page)) {

                dec_mm_counter(mm, file_rss);

                inc_mm_counter(mm, anon_rss);

            }

        } else

            inc_mm_counter(mm, anon_rss);

        flush_cache_page(vma, address, pte_pfn(orig_pte));

        entry = mk_pte(new_page, vma->vm_page_prot);// build a pte for new_page using the vma's page protection

        entry = maybe_mkwrite(pte_mkdirty(entry), vma);// mark it dirty; maybe_mkwrite() presumably adds write permission when the vma allows it

        /*

         * Clear the pte entry and flush it first, before updating the

         * pte with the new entry. This will avoid a race condition

         * seen in the presence of one thread doing SMC and another

         * thread doing COW.

         */

        ptep_clear_flush(vma, address, page_table);

        page_add_new_anon_rmap(new_page, vma, address);

        /*

         * We call the notify macro here because, when using secondary

         * mmu page tables (such as kvm shadow page tables), we want the

         * new page to be mapped directly into the secondary page table.

         */

        set_pte_at_notify(mm, address, page_table, entry);

        update_mmu_cache(vma, address, entry);

        if (old_page) {

            /*

             * Only after switching the pte to the new page may

             * we remove the mapcount here. Otherwise another

             * process may come and find the rmap count decremented

             * before the pte is switched to the new page, and

             * "reuse" the old page writing into it while our pte

             * here still points into it and can be read by other

             * threads.

             *

             * The critical issue is to order this

             * page_remove_rmap with the ptp_clear_flush above.

             * Those stores are ordered by (if nothing else,)

             * the barrier present in the atomic_add_negative

             * in page_remove_rmap.

             *

             * Then the TLB flush in ptep_clear_flush ensures that

             * no process can access the old page before the

             * decremented mapcount is visible. And the old page

             * cannot be reused until after the decremented

             * mapcount is visible. So transitively, TLBs to

             * old page will be flushed before it can be reused.

             */

            page_remove_rmap(old_page);

        }



        /* Free the old page.. */

        /* new_page now aliases old_page, so the two releases below both act on old_page. */

        new_page = old_page;

        ret |= VM_FAULT_WRITE;

    } else

        mem_cgroup_uncharge_page(new_page);



    /* If the pte had changed, new_page is still the unused freshly allocated page and is released here. */

    if (new_page)

        page_cache_release(new_page);

    if (old_page)

        page_cache_release(old_page);

unlock:

    pte_unmap_unlock(page_table, ptl);

    if (dirty_page) {

        /*

         * Yes, Virginia, this is actually required to prevent a race

         * with clear_page_dirty_for_io() from clearing the page dirty

         * bit after it clear all dirty ptes, but before a racing

         * do_wp_page installs a dirty pte.

         *

         * do_no_page is protected similarly.

         */

        if (!page_mkwrite) {

            wait_on_page_locked(dirty_page);

            set_page_dirty_balance(dirty_page, page_mkwrite);

        }

        put_page(dirty_page);

        if (page_mkwrite) {

            struct address_space *mapping = dirty_page->mapping;



            set_page_dirty(dirty_page);

            unlock_page(dirty_page);

            page_cache_release(dirty_page);

            if (mapping)    {

                /*

                 * Some device drivers do not set page.mapping

                 * but still dirty their pages

                 */

                balance_dirty_pages_ratelimited(mapping);

            }

        }



        /* file_update_time outside page_lock */

        if (vma->vm_file)

            file_update_time(vma->vm_file);

    }

    return ret;

oom_free_new:

    page_cache_release(new_page);

oom:

    if (old_page) {

        if (page_mkwrite) {

            unlock_page(old_page);

            page_cache_release(old_page);

        }

        page_cache_release(old_page);

    }

    return VM_FAULT_OOM;



    /* Reached with the pte lock already dropped; only the reference taken before ->page_mkwrite() is released. */

unwritable_page:

    page_cache_release(old_page);

    return ret;

}
查看全文

相关阅读:
Java GUI学习心得
 Kettle6.0表输入连接数据库
 理解javascript继承 Minoz
理解作用域 Minoz
深入理解javascript作用域链 Minoz
JavaScript数组总结 Minoz
收获2.css圆角总结 Minoz
一次前端作业的收获 Minoz
深入理解闭包 Minoz
理解javascript原型与原型链 Minoz

原文地址：https://www.cnblogs.com/sky-heaven/p/5663397.html