zoukankan      html  css  js  c++  java
  • mmap 从 glibc 到 kernel 的实现

    昨晚脑子抽了,突然想扒扒 mmap

    映射文件

    刚刚开始我是尝试 mmap 一个文件,实现对内存的读写达到读写文件内容的目的,踩了几个坑

    #include<stdio.h>
    #include<unistd.h>
    #include<sys/mman.h>
    #include<sys/stat.h>
    #include<sys/types.h>
    #include<fcntl.h>
    #include<string.h>
    
    int main(int argc, char *argv[])
    {
            struct stat st;
            void *ptr;
            int filesize;
    
            int fd = open("bin", O_RDWR);
            fstat(fd, &st);
            filesize = st.st_size;
            printf("%d
    ", filesize);
    
            ptr = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            close(fd);
    
            printf("%p
    ", ptr);
            memset(ptr, 0x46, 10);
            msync(ptr, filesize, MS_SYNC);
            munmap(ptr, filesize);
            return 0;
    }
    
    1. 我想的是读写,然后我看文章的时候我看到的是:用 open 得到一个 文件描述符,demo 上写的是

      int fd = open("xxxx", O_RDONLY);
      

      我信了然后我也照写,后面我发现我怎么都不能映射成功,后面我才意识到 open 时我用的只读,然后 mmap 时用读写

      ptr = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      
    2. 还有就是同步的问题,原本我的理解是对 ptr 指向的进行内存读写,系统会自动同步到文件,然而不是。一般来说,进程在映射空间中对共享内容的修改并不会直接写回到磁盘文件中,往往在调用munmap()之后才会同步输出到磁盘文件中.那么,在程序运行过 程中,在调用munmap()之前,可以通过调用msync()来实现磁盘上文件内容与共享内存区中的内容与一致,或者是把对共享内存区的修改同步输出到 磁盘文件中。

    代码注入

    完成后我还试了开一个 PROT_EXEC 权限的内存块去做代码注入

    这里我又踩到坑,我在写 shellcode 的时候,可能是我太 sb 了,我是这样写的:

    section .data
    	msg db      "hello, world!
    "
    section .text
        global _start
    _start:
        mov     rax, 1
        mov     rdi, 1
        mov     rsi, msg
        mov     rdx, 13
        syscall
        mov    rax, 60
        mov    rdi, 0
        syscall
    

    编译

    nasm -g -f elf64 shellcode.asm
    ld shellcode.o -o shellocde
    

    确实可以运行,然后我用 objdump 抠 shellcode 的时候发现,我没法抠下 msg ,因为它在 data 段,我只能抠 text 段的,我又想到把 msg 放在 text 段,但是我怎么去拿到它的地址?

    后面我才知道有个技巧

    via:https://stackoverflow.com/questions/15593214/linux-shellcode-hello-world

    然后我把代码改成:

    section .text
        global _start
    _start:
        call _exec
        msg db      "hello, world!
    "
        
    _exec:
        mov     rax, 1
        mov     rdi, 1
        pop     rsi
        mov     rdx, 13
        syscall
        mov    rax, 60
        mov    rdi, 0
        syscall
    

    其实这个,利用了 call 指令把 msg 的地址压栈

    还是挺好理解的,就是 call 的时候会把下一条指令的地址当成 返回地址 压栈,这样在使用 pop 的时候就能得到这个地址

    编译,抠 代码 :

    echo "xe8x11x00x00x00x68x65x6cx6cx6fx2cx20x77x6fx72x6cx64x21x5cx72x5cx6exb8x01x00x00x00xbfx01x00x00x00x5exbax0dx00x00x00x0fx05xb8x3cx00x00x00xbfx00x00x00x00x0fx05" > elf64
    

    c source:

    #include<stdio.h>
    #include<unistd.h>
    #include<sys/mman.h>
    #include<sys/stat.h>
    #include<sys/types.h>
    #include<fcntl.h>
    #include<string.h>
    
    int main(int argc, char *argv[])
    {
            typedef void (*shellcode_t)();
            struct stat st;
            void *ptr;
            int filesize;
    
            int fp = open("elf64", O_RDWR);
            fstat(fp, &st);
            filesize = st.st_size;
            printf("%d
    ", filesize);
    
            ptr = mmap(NULL, filesize, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_SHARED, fp, 0);
            close(fp);
            printf("%p
    ", ptr);
            
            shellcode_t f = (shellcode_t)ptr;
            f();
            
            munmap(ptr, filesize);
            return 0;
    }
    

    父子进程通讯

    这里我用的是 mmap 匿名映射,所谓匿名映射,表示不存在 fd 这么个真实的文件,不需要映射真实文件

    还有一种方法就是映射 /dev/zero

    /dev/zero 是一个特殊的文件,当你读它的时候,它会提供无限的空字符(NULL, ASCII NUL, 0x00)
    一个作用是用它作为源,产生一个特定大小的空白文件

    #include<stdio.h>
    #include<unistd.h>
    #include<sys/mman.h>
    #include<sys/stat.h>
    #include<sys/types.h>
    
    int main(int argc, char *argv[])
    {
            char *ptr;
            ptr = (char *)mmap(NULL, 100, PROT_READ | PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);
    
            printf("%p
    ", ptr);
            if(fork() == 0) {
                    sleep(1);
                    printf("chlid got a message: %s
    ", ptr);
                    sprintf(ptr, "%s", "I am your son");
                    munmap(ptr, 100);
            }
    
            sprintf(ptr, "%s" ,"I am your father");
            sleep(3);
            printf("father got a message: %s
    ", ptr);
            return 0;
    }
    

    这个原理就是,fork 是子进程会 复制(当然是写时复制 COW,这个不展开) 父进程的东西,但是 mmap 得到的这块内存是父子进程共同维护的,对这块内存进行读写就能实现进程通信

    源码分析

    用法大概试了个遍,我直接追源码

    写了个 demo 配合自己编译的 libc

    #include<stdio.h>
    #include<sys/mman.h>
    #include<unistd.h>
    
    int main()
    {
            void *p = mmap(NULL, 100, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
            printf("%p
    ", p);
            return 0;
    }
    

    gdb 调试

    其实在用户层没有什么东西,看 diasm 窗口最后就是使用一个系统调用:

       0x7ffff7b0e8fa <mmap64+42>    mov    r9, rbx
       0x7ffff7b0e8fd <mmap64+45>    mov    r8d, r15d
       0x7ffff7b0e900 <mmap64+48>    mov    r10d, r14d
       0x7ffff7b0e903 <mmap64+51>    mov    edx, r12d
       0x7ffff7b0e906 <mmap64+54>    mov    rsi, r13
       0x7ffff7b0e909 <mmap64+57>    mov    rdi, rbp
       0x7ffff7b0e90c <mmap64+60>    mov    eax, 9
       0x7ffff7b0e911 <mmap64+65>    syscall 
    

    C source code:

    位于 glibc 源码: sysdeps/unix/sysv/linux/mmap64.c

    via:https://code.woboq.org/userspace/glibc/sysdeps/unix/sysv/linux/mmap64.c.html#40

    /* To avoid silent truncation of offset when using mmap2, do not accept
       offset larger than 1 << (page_shift + off_t bits).  For archictures with
       32 bits off_t and page size of 4096 it would be 1^44.  */
    #define MMAP_OFF_HIGH_MASK 
      ((-(MMAP2_PAGE_UNIT << 1) << (8 * sizeof (off_t) - 1)))
    
    #define MMAP_OFF_MASK (MMAP_OFF_HIGH_MASK | MMAP_OFF_LOW_MASK)
    
    /* An architecture may override this.  */
    #ifndef MMAP_PREPARE
    # define MMAP_PREPARE(addr, len, prot, flags, fd, offset)
    #endif
    
    void *
    __mmap64 (void *addr, size_t len, int prot, int flags, int fd, off64_t offset)
    {
      MMAP_CHECK_PAGE_UNIT ();
    
      if (offset & MMAP_OFF_MASK)
        return (void *) INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
    
      MMAP_PREPARE (addr, len, prot, flags, fd, offset);
    #ifdef __NR_mmap2
      return (void *) MMAP_CALL (mmap2, addr, len, prot, flags, fd,
    			     (off_t) (offset / MMAP2_PAGE_UNIT));
    #else
      return (void *) MMAP_CALL (mmap, addr, len, prot, flags, fd, offset);
    #endif
    }
    weak_alias (__mmap64, mmap64)
    libc_hidden_def (__mmap64)
    

    在用户层就是那么多东西

    就是一个 9 号系统调用

    cat /usr/include/asm/unistd_64.h

    查到

    #define __NR_mmap 9
    

    现在追进内核

    9 号系统调用有 6 个参数(其实可以利用这个去查找因为是 6 个参数所以 搜索 SYSCALL_DEFINE6(mmap 这里的 6 就是描述参数 )

    然后找到:

    SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
    		unsigned long, prot, unsigned long, flags,
    		unsigned long, fd, unsigned long, pgoff)
    {
    	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
    }
    

    这个方法用 vscode 的人比较方便操作

    因为我是用 vim + gtags

    在内核里面找系统调用

    我用的是 5.4-rc4 的 kernel

    我查看 arch/x86/entry/syscalls/syscall_64.tbl

    9	common	mmap			__x64_sys_mmap
    

    看到是 sys_mmap

    直接跳到 sys_mmap 函数

    arch/ia64/kernel/sys_ia64.c

    via: https://elixir.bootlin.com/linux/v5.4-rc4/source/arch/ia64/kernel/sys_ia64.c#L149

    asmlinkage unsigned long
    sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
    {
    	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
    	if (!IS_ERR((void *) addr))
    		force_successful_syscall_return();
    	return addr;
    }
    
    asmlinkage unsigned long
    sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
    {
    	if (offset_in_page(off) != 0)
    		return -EINVAL;
    
    	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
    	if (!IS_ERR((void *) addr))
    		force_successful_syscall_return();
    	return addr;
    }
    

    跟下去:
    mm/mmap.c

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1585

    unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
    			      unsigned long prot, unsigned long flags,
    			      unsigned long fd, unsigned long pgoff)
    {
    	struct file *file = NULL;
    	unsigned long retval;
    
    	addr = untagged_addr(addr);
    
        //判断是不是匿名映射
    	if (!(flags & MAP_ANONYMOUS)) {
    		audit_mmap_fd(fd, flags);
    		file = fget(fd);
    		if (!file)
    			return -EBADF;
    		if (is_file_hugepages(file))
    			len = ALIGN(len, huge_page_size(hstate_file(file)));
    		retval = -EINVAL;
    		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
    			goto out_fput;
    	} else if (flags & MAP_HUGETLB) {
    		struct user_struct *user = NULL;
    		struct hstate *hs;
    
    		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
    		if (!hs)
    			return -EINVAL;
    
    		len = ALIGN(len, huge_page_size(hs));
    		/*
    		 * VM_NORESERVE is used because the reservations will be
    		 * taken when vm_ops->mmap() is called
    		 * A dummy user value is used because we are not locking
    		 * memory so no accounting is necessary
    		 */
    		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
    				VM_NORESERVE,
    				&user, HUGETLB_ANONHUGE_INODE,
    				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
    		if (IS_ERR(file))
    			return PTR_ERR(file);
    	}
    
    	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
    
    	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
    out_fput:
    	if (file)
    		fput(file);
    	return retval;
    }
    
    SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
    		unsigned long, prot, unsigned long, flags,
    		unsigned long, fd, unsigned long, pgoff)
    {
    	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
    }
    

    mm/util.c

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/util.c#L483

    unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
    	unsigned long len, unsigned long prot,
    	unsigned long flag, unsigned long pgoff)
    {
    	unsigned long ret;
        //获得 描述 当前进程 内存 的 mm_struct
    	struct mm_struct *mm = current->mm;
    	unsigned long populate;
    	LIST_HEAD(uf);
    
    	ret = security_mmap_file(file, prot, flag);
        
    	if (!ret) {
    		if (down_write_killable(&mm->mmap_sem))
    			return -EINTR;
    		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
    				    &populate, &uf);
    		up_write(&mm->mmap_sem);
    		userfaultfd_unmap_complete(mm, &uf);
    		if (populate)
    			mm_populate(ret, populate);
    	}
    	return ret;
    }
    

    security_mmap_file

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/security/security.c#L1445

    int security_mmap_file(struct file *file, unsigned long prot,
    			unsigned long flags)
    {
    	int ret;
        //这个相当调用了 mmap_file 函数,这里的 call_int_hook 是这样定义的:
       /*
       #define call_int_hook(FUNC, IRC, ...) ({			
    	int RC = IRC;						
    	do {							
    		struct security_hook_list *P;			
    								
    		hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { 
    			RC = P->hook.FUNC(__VA_ARGS__);		
    			if (RC != 0)				
    				break;				
    		}						
    	} while (0);						
    	RC;							
    })
    首先是一个 security_hook_list 结构体
    在里面找到了 security_list_options 结构体
    在 security_list_options 里面有 mmap_file 函数指针
    hlist_for_each_entry  哈希表
    security_hook_heads 里面有	struct hlist_head mmap_file;
    看的我烦死,反正就是他妈的调用了mmap_file,__VA_ARGS__,把后面的参数(...)传给 mmap_file
    看代码比看我描述简单
    */
        
    	ret = call_int_hook(mmap_file, 0, file, prot,
    					mmap_prot(file, prot), flags);
    	if (ret)
    		return ret;
    	return ima_file_mmap(file, prot);
    }
    

    mmap_prot (用来处理 prot 参数)

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/security/security.c#L1412

    static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
    {
    	/*
    	 * Does we have PROT_READ and does the application expect
    	 * it to imply PROT_EXEC?  If not, nothing to talk about...
    	 */
        //自己看上面的英文注释
    	if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
    		return prot;
    	if (!(current->personality & READ_IMPLIES_EXEC))
    		return prot;
    	/*
    	 * if that's an anonymous mapping, let it.
    	 */
    	if (!file)
    		return prot | PROT_EXEC;
    	/*
    	 * ditto if it's not on noexec mount, except that on !MMU we need
    	 * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
    	 */
    	if (!path_noexec(&file->f_path)) {
    #ifndef CONFIG_MMU
    		if (file->f_op->mmap_capabilities) {
    			unsigned caps = file->f_op->mmap_capabilities(file);
    			if (!(caps & NOMMU_MAP_EXEC))
    				return prot;
    		}
    #endif
    		return prot | PROT_EXEC;
    	}
    	/* anything on noexec mount won't get PROT_EXEC */
    	return prot;
    }
    

    mmap_file

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/scripts/recordmcount.c#L158

    static void *mmap_file(char const *fname)
    {
    	/* Avoid problems if early cleanup() */
    	fd_map = -1;
    	mmap_failed = 1;
    	file_map = NULL;
    	file_ptr = NULL;
    	file_updated = 0;
    	sb.st_size = 0;
    
    	fd_map = open(fname, O_RDONLY);
    	if (fd_map < 0) {
    		perror(fname);
    		return NULL;
    	}
    	if (fstat(fd_map, &sb) < 0) {
    		perror(fname);
    		goto out;
    	}
    	if (!S_ISREG(sb.st_mode)) {
    		fprintf(stderr, "not a regular file: %s
    ", fname);
    		goto out;
    	}
    	file_map = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
    			fd_map, 0);
    	if (file_map == MAP_FAILED) {
    		mmap_failed = 1;
    		file_map = umalloc(sb.st_size);
    		if (!file_map) {
    			perror(fname);
    			goto out;
    		}
    		if (read(fd_map, file_map, sb.st_size) != sb.st_size) {
    			perror(fname);
    			free(file_map);
    			file_map = NULL;
    			goto out;
    		}
    	} else
    		mmap_failed = 0;
    out:
    	close(fd_map);
    	fd_map = -1;
    
    	file_end = file_map + sb.st_size;
    
    	return file_map;
    }
    

    主要的函数调用路线是这样的:

    vm_mmap_pgoff -> do_mmap_pgoff -> do_mmap -> mmap_region

    主要就是后面两个函数

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1397

    unsigned long do_mmap(struct file *file, unsigned long addr,
    			unsigned long len, unsigned long prot,
    			unsigned long flags, vm_flags_t vm_flags,
    			unsigned long pgoff, unsigned long *populate,
    			struct list_head *uf)
    {
    	struct mm_struct *mm = current->mm;
    	int pkey = 0;
    
    	*populate = 0;
    
    	if (!len)
    		return -EINVAL;
    
    	/*
    	 * Does the application expect PROT_READ to imply PROT_EXEC?
    	 *
    	 * (the exception is when the underlying filesystem is noexec
    	 *  mounted, in which case we dont add PROT_EXEC.)
    	 */
        //判断权限,看上面的注释(懒得翻译)
    	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
    		if (!(file && path_noexec(&file->f_path)))
    			prot |= PROT_EXEC;
    
    	/* force arch specific MAP_FIXED handling in get_unmapped_area */
    	if (flags & MAP_FIXED_NOREPLACE)
    		flags |= MAP_FIXED;
    
    	if (!(flags & MAP_FIXED))
    		addr = round_hint_to_min(addr);
    
    	/* Careful about overflows.. */
        /*判断溢出*/
    	len = PAGE_ALIGN(len);
    	if (!len)
    		return -ENOMEM;
    
    	/* offset overflow? */
    	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
    		return -EOVERFLOW;
    
    	/* Too many mappings? */
    
    	if (mm->map_count > sysctl_max_map_count)
    		return -ENOMEM;
    
    	/* Obtain the address to map to. we verify (or select) it and ensure
    	 * that it represents a valid section of the address space.
    	 */
    	addr = get_unmapped_area(file, addr, len, pgoff, flags);
    	if (offset_in_page(addr))
    		return addr;
    
    	if (flags & MAP_FIXED_NOREPLACE) {
    		struct vm_area_struct *vma = find_vma(mm, addr);
    
    		if (vma && vma->vm_start < addr + len)
    			return -EEXIST;
    	}
    
    	if (prot == PROT_EXEC) {
    		pkey = execute_only_pkey(mm);
    		if (pkey < 0)
    			pkey = 0;
    	}
    
    	/* Do simple checking here so the lower-level routines won't have
    	 * to. we assume access permissions have been handled by the open
    	 * of the memory object, so we don't do any here.
    	 */
    	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
    			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
    
    	if (flags & MAP_LOCKED)
    		if (!can_do_mlock())
    			return -EPERM;
    
    	if (mlock_future_check(mm, vm_flags, len))
    		return -EAGAIN;
    
    	if (file) {
    		struct inode *inode = file_inode(file);
    		unsigned long flags_mask;
    
    		if (!file_mmap_ok(file, inode, pgoff, len))
    			return -EOVERFLOW;
    
    		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
    
    		switch (flags & MAP_TYPE) {
    		case MAP_SHARED:
    			/*
    			 * Force use of MAP_SHARED_VALIDATE with non-legacy
    			 * flags. E.g. MAP_SYNC is dangerous to use with
    			 * MAP_SHARED as you don't know which consistency model
    			 * you will get. We silently ignore unsupported flags
    			 * with MAP_SHARED to preserve backward compatibility.
    			 */
    			flags &= LEGACY_MAP_MASK;
    			/* fall through */
    		case MAP_SHARED_VALIDATE:
    			if (flags & ~flags_mask)
    				return -EOPNOTSUPP;
    			if (prot & PROT_WRITE) {
    				if (!(file->f_mode & FMODE_WRITE))
    					return -EACCES;
    				if (IS_SWAPFILE(file->f_mapping->host))
    					return -ETXTBSY;
    			}
    
    			/*
    			 * Make sure we don't allow writing to an append-only
    			 * file..
    			 */
    			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
    				return -EACCES;
    
    			/*
    			 * Make sure there are no mandatory locks on the file.
    			 */
    			if (locks_verify_locked(file))
    				return -EAGAIN;
    
    			vm_flags |= VM_SHARED | VM_MAYSHARE;
    			if (!(file->f_mode & FMODE_WRITE))
    				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
    
    			/* fall through */
    		case MAP_PRIVATE:
    			if (!(file->f_mode & FMODE_READ))
    				return -EACCES;
    			if (path_noexec(&file->f_path)) {
    				if (vm_flags & VM_EXEC)
    					return -EPERM;
    				vm_flags &= ~VM_MAYEXEC;
    			}
    
    			if (!file->f_op->mmap)
    				return -ENODEV;
    			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
    				return -EINVAL;
    			break;
    
    		default:
    			return -EINVAL;
    		}
    	} else {
    		switch (flags & MAP_TYPE) {
    		case MAP_SHARED:
    			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
    				return -EINVAL;
    			/*
    			 * Ignore pgoff.
    			 */
    			pgoff = 0;
    			vm_flags |= VM_SHARED | VM_MAYSHARE;
    			break;
    		case MAP_PRIVATE:
    			/*
    			 * Set pgoff according to addr for anon_vma.
    			 */
    			pgoff = addr >> PAGE_SHIFT;
    			break;
    		default:
    			return -EINVAL;
    		}
    	}
    
    	/*
    	 * Set 'VM_NORESERVE' if we should not account for the
    	 * memory use of this mapping.
    	 */
    	if (flags & MAP_NORESERVE) {
    		/* We honor MAP_NORESERVE if allowed to overcommit */
    		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
    			vm_flags |= VM_NORESERVE;
    
    		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
    		if (file && is_file_hugepages(file))
    			vm_flags |= VM_NORESERVE;
    	}
    
    	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
    	if (!IS_ERR_VALUE(addr) &&
    	    ((vm_flags & VM_LOCKED) ||
    	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
    		*populate = len;
    	return addr;
    }
    

    mmap_region

    via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1721

    unsigned long mmap_region(struct file *file, unsigned long addr,
    		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
    		struct list_head *uf)
    {
    	struct mm_struct *mm = current->mm;
    	struct vm_area_struct *vma, *prev;
    	int error;
    	struct rb_node **rb_link, *rb_parent;
    	unsigned long charged = 0;
    
    	/* Check against address space limit. */
    	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
    		unsigned long nr_pages;
    
    		/*
    		 * MAP_FIXED may remove pages of mappings that intersects with
    		 * requested mapping. Account for the pages it would unmap.
    		 */
    		nr_pages = count_vma_pages_range(mm, addr, addr + len);
    
    		if (!may_expand_vm(mm, vm_flags,
    					(len >> PAGE_SHIFT) - nr_pages))
    			return -ENOMEM;
    	}
    
    	/* Clear old maps */
    	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
    			      &rb_parent)) {
    		if (do_munmap(mm, addr, len, uf))
    			return -ENOMEM;
    	}
    
    	/*
    	 * Private writable mapping: check memory availability
    	 */
    	if (accountable_mapping(file, vm_flags)) {
    		charged = len >> PAGE_SHIFT;
    		if (security_vm_enough_memory_mm(mm, charged))
    			return -ENOMEM;
    		vm_flags |= VM_ACCOUNT;
    	}
    
    	/*
    	 * Can we just expand an old mapping?
    	 */
    	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
    			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
    	if (vma)
    		goto out;
    
    	/*
    	 * Determine the object being mapped and call the appropriate
    	 * specific mapper. the address has already been validated, but
    	 * not unmapped, but the maps are removed from the list.
    	 */
    	vma = vm_area_alloc(mm);
    	if (!vma) {
    		error = -ENOMEM;
    		goto unacct_error;
    	}
    
    	vma->vm_start = addr;
    	vma->vm_end = addr + len;
    	vma->vm_flags = vm_flags;
    	vma->vm_page_prot = vm_get_page_prot(vm_flags);
    	vma->vm_pgoff = pgoff;
    
    	if (file) {
    		if (vm_flags & VM_DENYWRITE) {
    			error = deny_write_access(file);
    			if (error)
    				goto free_vma;
    		}
    		if (vm_flags & VM_SHARED) {
    			error = mapping_map_writable(file->f_mapping);
    			if (error)
    				goto allow_write_and_free_vma;
    		}
    
    		/* ->mmap() can change vma->vm_file, but must guarantee that
    		 * vma_link() below can deny write-access if VM_DENYWRITE is set
    		 * and map writably if VM_SHARED is set. This usually means the
    		 * new file must not have been exposed to user-space, yet.
    		 */
    		vma->vm_file = get_file(file);
    		error = call_mmap(file, vma);
    		if (error)
    			goto unmap_and_free_vma;
    
    		/* Can addr have changed??
    		 *
    		 * Answer: Yes, several device drivers can do it in their
    		 *         f_op->mmap method. -DaveM
    		 * Bug: If addr is changed, prev, rb_link, rb_parent should
    		 *      be updated for vma_link()
    		 */
    		WARN_ON_ONCE(addr != vma->vm_start);
    
    		addr = vma->vm_start;
    		vm_flags = vma->vm_flags;
    	} else if (vm_flags & VM_SHARED) {
    		error = shmem_zero_setup(vma);
    		if (error)
    			goto free_vma;
    	} else {
    		vma_set_anonymous(vma);
    	}
    
    	vma_link(mm, vma, prev, rb_link, rb_parent);
    	/* Once vma denies write, undo our temporary denial count */
    	if (file) {
    		if (vm_flags & VM_SHARED)
    			mapping_unmap_writable(file->f_mapping);
    		if (vm_flags & VM_DENYWRITE)
    			allow_write_access(file);
    	}
    	file = vma->vm_file;
    out:
    	perf_event_mmap(vma);
    
    	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
    	if (vm_flags & VM_LOCKED) {
    		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
    					is_vm_hugetlb_page(vma) ||
    					vma == get_gate_vma(current->mm))
    			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
    		else
    			mm->locked_vm += (len >> PAGE_SHIFT);
    	}
    
    	if (file)
    		uprobe_mmap(vma);
    
    	/*
    	 * New (or expanded) vma always get soft dirty status.
    	 * Otherwise user-space soft-dirty page tracker won't
    	 * be able to distinguish situation when vma area unmapped,
    	 * then new mapped in-place (which must be aimed as
    	 * a completely new data area).
    	 */
    	vma->vm_flags |= VM_SOFTDIRTY;
    
    	vma_set_page_prot(vma);
    
    	return addr;
    
    unmap_and_free_vma:
    	vma->vm_file = NULL;
    	fput(file);
    
    	/* Undo any partial mapping done by a device driver. */
    	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
    	charged = 0;
    	if (vm_flags & VM_SHARED)
    		mapping_unmap_writable(file->f_mapping);
    allow_write_and_free_vma:
    	if (vm_flags & VM_DENYWRITE)
    		allow_write_access(file);
    free_vma:
    	vm_area_free(vma);
    unacct_error:
    	if (charged)
    		vm_unacct_memory(charged);
    	return error;
    }
    

    这里涉及的面有点庞大,全部追踪下去的话还会涉及 文件 的部分。我就简单描述一下,其实 mmap 的最终 映射的内存是由 vm_area_struct 来描述,里面有两个指针:vm_start, vm_end,描述映射区的起始地址和结束地址

    	vma = vm_area_alloc(mm);
    	if (!vma) {
    		error = -ENOMEM;
    		goto unacct_error;
    	}
    
    	vma->vm_start = addr;
    	vma->vm_end = addr + len;
    	vma->vm_flags = vm_flags;
    	vma->vm_page_prot = vm_get_page_prot(vm_flags);
    	vma->vm_pgoff = pgoff;
    

    里面提到 current- > mm 这个 current 就是描述当先进程信息的 task_struct

  • 相关阅读:
    STM32下载程序后不能运行
    (转载)时序约束的基本方法
    PLL失锁的问题
    算法与硬件构架的关系
    构建低成本、高度可配置的桥接解决方案:在嵌入式设计中采用基于D-PHY的MIPI标准外设
    SOLDERMASK_TOP不显示
    Allegro brd文件更新封装及焊盘方法
    Cadence Allegro光绘文件生成技巧
    shape合并
    allegro 如何 敷铜(铺铜),并去掉敷铜岛
  • 原文地址:https://www.cnblogs.com/crybaby/p/12940181.html
Copyright © 2011-2022 走看看