zoukankan      html  css  js  c++  java
  • Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

    目录

    0. 引言
    1. open() syscall
    2. close() syscall

    0. 引言

    在linux的哲学中,所有的磁盘文件、目录、外设设备、驱动设备全部被抽象为了"文件"这个概念,所以本文提到的"File IO"适用于linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系统调用对应的内核源代码:linux下每一个系统调用都有对应的内核源代码,而我们在ring3常用的glibc等C库API,只是对系统调用的一层封装,最终还是要通过系统调用实现功能

    0x1: SYSCALL_DEFINE宏定义

    我们在学习内核源代码的时候经常会遇到一个宏定义: SYSCALL_DEFINE,所有的系统调用的声明都通过它来实现

    linux-2.6.32.63/include/linux/syscalls.h

    /*
     * SYSCALL_DEFINE0(name) - declare a system call that takes no arguments.
     *
     * With CONFIG_FTRACE_SYSCALLS the macro first emits the enter/exit trace
     * events and a syscall_metadata record (placed in the
     * "__syscalls_metadata" section) and then the sys_<name>(void) prototype.
     * Without that option only the prototype is produced.
     *
     * Every line of the multi-line definition except the last must end in a
     * backslash, otherwise the preprocessor ends the macro at the first line.
     */
    #ifdef CONFIG_FTRACE_SYSCALLS
        #define SYSCALL_DEFINE0(sname)                        \
            SYSCALL_TRACE_ENTER_EVENT(_##sname);              \
            SYSCALL_TRACE_EXIT_EVENT(_##sname);               \
            static const struct syscall_metadata __used       \
              __attribute__((__aligned__(4)))                 \
              __attribute__((section("__syscalls_metadata"))) \
              __syscall_meta_##sname = {                      \
                .name         = "sys_"#sname,                 \
                .nb_args     = 0,                             \
                .enter_event    = &event_enter__##sname,      \
                .exit_event    = &event_exit__##sname,        \
            };                                                \
            asmlinkage long sys_##sname(void)
    #else
        #define SYSCALL_DEFINE0(name)       asmlinkage long sys_##name(void)
    #endif
    
    /*
     * Fixed-arity front ends. Each one forwards its argument count, the
     * syscall name prefixed with an underscore, and the remaining
     * (type, name) argument list unchanged to SYSCALL_DEFINEx.
     */
    #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
    #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
    #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
    #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
    #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
    #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

    ...

    /*
     * SYSCALL_DEFINEx(x, sname, ...) - common back end of SYSCALL_DEFINE1..6.
     *
     * x is the argument count, sname the underscore-prefixed syscall name and
     * the variadic part the (type, name) argument list. With
     * CONFIG_FTRACE_SYSCALLS it additionally emits the types_<sname>[] and
     * args_<sname>[] string tables plus SYSCALL_METADATA(); in both
     * configurations it ends by expanding __SYSCALL_DEFINEx().
     *
     * Each continued line must end in a backslash to stay inside the macro.
     */
    #ifdef CONFIG_FTRACE_SYSCALLS
        #define SYSCALL_DEFINEx(x, sname, ...)                \
            static const char *types_##sname[] = {            \
                __SC_STR_TDECL##x(__VA_ARGS__)                \
            };                                                \
            static const char *args_##sname[] = {             \
                __SC_STR_ADECL##x(__VA_ARGS__)                \
            };                                                \
            SYSCALL_METADATA(sname, x);                       \
            __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
    #else
        #define SYSCALL_DEFINEx(x, sname, ...)                \
            __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
    #endif
    
    /*
     * __SYSCALL_DEFINEx(x, name, ...) - emit the function header for an
     * x-argument system call; SYSCALL_DEFINE(name) is the plain-name variant.
     *
     * With CONFIG_HAVE_SYSCALL_WRAPPERS the body written after the macro
     * becomes the static inline SYSC<name>(). It is called from SyS<name>(),
     * which receives its arguments through __SC_LONG##x, checks them with
     * __SC_TEST##x and converts them back with __SC_CAST##x; sys<name> is tied
     * to SyS<name> through SYSCALL_ALIAS(). Without the option the macro is
     * only the asmlinkage sys<name>() header.
     *
     * Each continued line must end in a backslash to stay inside the macro.
     */
    #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
        #define SYSCALL_DEFINE(name) static inline long SYSC_##name
        #define __SYSCALL_DEFINEx(x, name, ...)                          \
        asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));            \
        static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));        \
        asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))             \
        {                                                                \
            __SC_TEST##x(__VA_ARGS__);                                   \
            return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));         \
        }                                                                \
        SYSCALL_ALIAS(sys##name, SyS##name);                             \
        static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
    #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
        #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
        #define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
    #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

    所以对函数定义

    SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等于
    asmlinkage long sys_socket(int family, int type, int protocol)

    Relevant Link:

    http://blog.csdn.net/p_panyuch/article/details/5648007

    1. open() syscall

    open()系统调用在kernel中对应的是sys_open()

    linux-2.6.32.63/fs/open.c

    /*
     * open(2) entry point. ORs O_LARGEFILE into the flags whenever
     * force_o_largefile() reports true, then passes the request on to
     * do_sys_open() with AT_FDCWD as the directory descriptor and returns
     * whatever that call returned.
     */
    SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
    {
        long result;

        if (force_o_largefile()) {
            flags |= O_LARGEFILE;
        }

        /* do_sys_open() carries out the actual open */
        result = do_sys_open(AT_FDCWD, filename, flags, mode);

        /* avoid REGPARM breakage on x86: */
        asmlinkage_protect(3, result, filename, flags, mode);

        return result;
    }

    继续跟进do_sys_open()函数

    /*
     * Common implementation behind open(): copy the path name in from user
     * space, reserve a descriptor, open the file and bind the two together.
     *
     * Returns the new descriptor on success. On failure it returns the error
     * decoded from getname(), the negative value from get_unused_fd_flags(),
     * or the error decoded from do_filp_open().
     */
    long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
    {
        struct file *filp;
        char *kname;
        int fd;

        /* getname() hands back a kernel copy of the user-supplied name */
        kname = getname(filename);
        if (IS_ERR(kname)) {
            fd = PTR_ERR(kname);
            return fd;
        }

        /* reserve a free descriptor number for this open */
        fd = get_unused_fd_flags(flags);
        if (fd < 0) {
            goto out_putname;
        }

        /* the open itself happens in do_filp_open() */
        filp = do_filp_open(dfd, kname, flags, mode, 0);
        if (IS_ERR(filp)) {
            /* open failed: hand the reserved descriptor back */
            put_unused_fd(fd);
            fd = PTR_ERR(filp);
            goto out_putname;
        }

        /* open succeeded: raise the fsnotify open event, then publish the file under fd */
        fsnotify_open(filp->f_path.dentry);
        fd_install(fd, filp);

    out_putname:
        /* release the kernel copy of the file name */
        putname(kname);
        return fd;
    }

    继续跟进do_filp_open()函数

    /*
     * Note that the low bits of the passed in "open_flag"
     * are not the same as in the local variable "flag". See
     * open_to_namei_flags() for more details.
     *
     * do_filp_open - resolve pathname (relative to dfd) and open or create it.
     *
     * Without O_CREAT the path is looked up directly with path_lookup_open()
     * and control jumps to the "ok" label. With O_CREAT the parent directory
     * is resolved first (path_init() + path_walk() with LOOKUP_PARENT), the
     * last component is looked up under the parent's i_mutex, and the file is
     * either created (negative dentry) or checked as an existing object
     * (O_EXCL, mount points, symlinks, directories) before reaching "ok".
     *
     * Returns the struct file produced by nameidata_to_filp() (which may
     * itself be an error pointer), or ERR_PTR(error) on the failure paths.
     */
    struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode)
    {
        /* local variable declarations */
        struct file *filp;
        struct nameidata nd;
        int error;
        struct path path;
        struct dentry *dir;
        int count = 0;
        int will_write;
        /* convert the open(2) flags to their namei form; see open_to_namei_flags() */
        int flag = open_to_namei_flags(open_flag);
        /* derive the access mode from the flags when the caller passed none */
        if (!acc_mode)
        {
            acc_mode = MAY_OPEN | ACC_MODE(flag);
        } 
    
        /* O_TRUNC implies we need access checks for write permissions */
        /* O_TRUNC: additionally require MAY_WRITE */
        if (flag & O_TRUNC)
        {
            acc_mode |= MAY_WRITE;
        } 
    
        /* Allow the LSM permission hook to distinguish append access from general write access. */
        /* O_APPEND: additionally request MAY_APPEND */
        if (flag & O_APPEND)
        {
            acc_mode |= MAY_APPEND;
        } 
    
        /* The simplest case - just a plain lookup. */
        /* taken when the caller is not creating a file */
        if (!(flag & O_CREAT)) 
        { 
            /*
            Before the kernel can access a file it has to find it; in the VFS this lookup is done by path_lookup or path_lookup_open.
            NOTE(review): per the article author, these turn the user-supplied path string into a dentry and set up the matching
            inode and file structures; the descriptor later returned to the user is how those structures are reached afterwards.
            */
            error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);
            if (error)
            {
                return ERR_PTR(error);
            } 
            goto ok;
        }
    
        /*
         * Create - we need to know the parent.
         */
        // path_init() prepares the lookup and path_walk() performs it; together they resolve a path name to its dentry
        error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
        if (error)
        {
            return ERR_PTR(error);
        } 
        /*
        path_walk() is the name-resolution routine of the VFS (the original note said "NFS", presumably a typo).
        NOTE(review): per the article author, it advances through the path one directory level at a time until the end
        is reached or a component fails, building the dentries and inodes of the tree as it goes - its body is not shown here.
        */
        error = path_walk(pathname, &nd);
        if (error) 
        {
            if (nd.root.mnt)
            {
                /* drop the dentry and vfsmount references held by nd.root */
                path_put(&nd.root);
            } 
            return ERR_PTR(error);
        }
        if (unlikely(!audit_dummy_context()))
        {
            /* record the inode for auditing */
            audit_inode(pathname, nd.path.dentry);
        } 
    
        /*
         * We have the parent and last component. First of all, check
         * that we are not asked to creat(2) an obvious directory - that
         * will not do.
         */
        error = -EISDIR;
        /* the last component must be a normal name with nothing following it */
        if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
        {
            goto exit_parent;
        } 
    
        error = -ENFILE;
        /* obtain an empty struct file; NULL means none is available */
        filp = get_empty_filp();
        if (filp == NULL)
        {
            goto exit_parent;
        } 
        /* fill in the open intent of the nameidata */
        nd.intent.open.file = filp;
        nd.intent.open.flags = flag;
        nd.intent.open.create_mode = mode;
        dir = nd.path.dentry;
        nd.flags &= ~LOOKUP_PARENT;
        nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
        if (flag & O_EXCL)
        {
            nd.flags |= LOOKUP_EXCL;
        } 
        mutex_lock(&dir->d_inode->i_mutex);
        /* look up the dentry of the last component described by nd */
        path.dentry = lookup_hash(&nd);
        path.mnt = nd.path.mnt;
    
    do_last:
        error = PTR_ERR(path.dentry);
        if (IS_ERR(path.dentry)) 
        {
            mutex_unlock(&dir->d_inode->i_mutex);
            goto exit;
        }
    
        if (IS_ERR(nd.intent.open.file)) 
        {
            error = PTR_ERR(nd.intent.open.file);
            goto exit_mutex_unlock;
        }
    
        /* Negative dentry, just create the file */
        /* no inode behind this dentry: the file does not exist yet, so create it */
        if (!path.dentry->d_inode) 
        {
            /*
             * This write is needed to ensure that a
             * ro->rw transition does not occur between
             * the time when the file is created and when
             * a permanent write count is taken through
             * the 'struct file' in nameidata_to_filp().
            */
            /* write access to the mount is required */
            error = mnt_want_write(nd.path.mnt);
            if (error)
            {
                goto exit_mutex_unlock;
            } 
            /* create and open the file using the namei-style flags */
            error = __open_namei_create(&nd, &path, flag, mode);
            if (error) 
            {
                mnt_drop_write(nd.path.mnt);
                goto exit;
            }
            /* turn the nameidata into the resulting struct file */
            filp = nameidata_to_filp(&nd, open_flag);
            if (IS_ERR(filp))
            {
                ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
            } 
            /* drop the mount write access taken above */
            mnt_drop_write(nd.path.mnt);
            if (nd.root.mnt)
            {
                /* drop the reference on nd.root */
                path_put(&nd.root);
            } 
            return filp;
        }
    
        /*
         * It already exists.
         */
        /* the file to be opened already exists */
        mutex_unlock(&dir->d_inode->i_mutex);
        /* record the inode for auditing */
        audit_inode(pathname, path.dentry);
    
        error = -EEXIST;
        /* flag checks: O_EXCL on an existing file fails with -EEXIST */
        if (flag & O_EXCL)
        {
            goto exit_dput;
        } 
    
        if (__follow_mount(&path))
        {
            error = -ELOOP;
            if (flag & O_NOFOLLOW)
            {
                goto exit_dput;
            } 
        }
    
        error = -ENOENT;
        if (!path.dentry->d_inode)
        {
            goto exit_dput;
        } 
        if (path.dentry->d_inode->i_op->follow_link)
        {
            goto do_link;
        } 
        /* move the path into the nameidata */
        path_to_nameidata(&path, &nd);
        error = -EISDIR;
        /* refuse when the target is a directory */
        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
        {
            goto exit;
        } 
    ok:
        /*
         * Consider:
         * 1. may_open() truncates a file
         * 2. a rw->ro mount transition occurs
         * 3. nameidata_to_filp() fails due to
         *    the ro mount.
         * That would be inconsistent, and should
         * be avoided. Taking this mnt write here
         * ensures that (2) can not occur.
         */
        /* determine whether this open will write to the filesystem */
        will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
        if (will_write) 
        {
            /* if so, take write access on the mount first */
            error = mnt_want_write(nd.path.mnt);
            if (error)
            {
                goto exit;
            } 
        }
        // may_open() performs the permission checks and, per the note above, the truncation
        error = may_open(&nd.path, acc_mode, flag);
        if (error) 
        {
            if (will_write)
            {
                mnt_drop_write(nd.path.mnt);
            } 
            goto exit;
        }
        filp = nameidata_to_filp(&nd, open_flag);
        if (IS_ERR(filp))
        {
            ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
        }
            
        /*
         * It is now safe to drop the mnt write
         * because the filp has had a write taken
         * on its behalf.
         */
        // drop the mount write access now that it is safe to do so
        if (will_write)
        {
            mnt_drop_write(nd.path.mnt);
        } 
        if (nd.root.mnt)
        {
            path_put(&nd.root);
        } 
        return filp;
    
    exit_mutex_unlock:
        mutex_unlock(&dir->d_inode->i_mutex);
    exit_dput:
        path_put_conditional(&path, &nd);
    exit:
        if (!IS_ERR(nd.intent.open.file))
        {
            release_open_intent(&nd);
        }
            
    exit_parent:
        if (nd.root.mnt)
        {
            path_put(&nd.root);
        } 
        path_put(&nd.path);
        return ERR_PTR(error);
    
    do_link:
    // symlink case: unless following is forbidden, resolve the link target by hand
        error = -ELOOP;
        if (flag & O_NOFOLLOW)
        {
            // following symlinks is not allowed: fail with the error set above
            goto exit_dput;
        } 
        /*
         * This is subtle. Instead of calling do_follow_link() we do the
         * thing by hands. The reason is that this way we have zero link_count
         * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
         * After that we have the parent and last component, i.e.
         * we are in the same situation as after the first path_walk().
         * Well, almost - if the last component is normal we get its copy
         * stored in nd->last.name and we will have to putname() it when we
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        /* the code below finds the dentry of the link target by hand */
    
        // set the LOOKUP_PARENT lookup flag
        nd.flags |= LOOKUP_PARENT;
        // ask the security hook whether following this link is allowed
        error = security_inode_follow_link(path.dentry, &nd);
        if (error)
        {
            goto exit_dput;
        } 
        // follow the symbolic link
        error = __do_follow_link(&path, &nd);
        if (error) 
        {
            /* Does someone understand code flow here? Or it is only
             * me so stupid? Anathema to whoever designed this non-sense
             * with "intent.open".
             */
            release_open_intent(&nd);
            if (nd.root.mnt)
            {
                path_put(&nd.root);
            } 
            return ERR_PTR(error);
        }
        nd.flags &= ~LOOKUP_PARENT;
        // examine the type of the last path component
        if (nd.last_type == LAST_BIND)
        {
            goto ok;
        } 
        error = -EISDIR;
        if (nd.last_type != LAST_NORM)
        {
            goto exit;
        } 
        if (nd.last.name[nd.last.len]) 
        {
            __putname(nd.last.name);
            goto exit;
        }
        error = -ELOOP;
        // give up with -ELOOP once the link counter has reached 32
        if (count++==32) 
        {
            __putname(nd.last.name);
            goto exit;
        }
        dir = nd.path.dentry;
        mutex_lock(&dir->d_inode->i_mutex);
        // refresh the path's dentry and mount, then retry from do_last
        path.dentry = lookup_hash(&nd);
        path.mnt = nd.path.mnt;
        __putname(nd.last.name);
        goto do_last;
    }

    总结一下流程

    1. open系统调用访问SYSCALL_DEFINE3函数
    2. 在open系统调用中,调用do_sys_open函数完成主要功能
    3. 在do_sys_open函数中,调用函数do_filp_open完成主要的打开功能
    4. 在内核中要打开一个文件,首先应该找到这个文件,而查找文件的过程在vfs里面是由do_path_lookup或者path_lookup_open函数来完成的
        4.1 设置nd->root=根路径(绝对地址)或者当前工作目录(相对地址)
        4.2 这一步做完了后,内核会建立一些数据结构(dentry,inode)来初始化查找的起点
        if(!retval){ retval = path_walk(name,nd);}
        4.3 path_walk会遍历路径的每一节点分量,也就是用"/"分隔开的每一部分,最终找到name指向的文件 
        int path_walk(const char *name,struct nameidata *nd)
        {
            return link_path_walk(name,nd);
            //path_walk其实相当于直接调用link_path_walk来完成工作
        }
        4.4 link_path_walk的主要工作是由其内部函数__link_path_walk 来完成的
            result = __link_path_walk(name,nd)
        4.5 __link_path_walk,该函数把传进来的字符串name,也就是用户指定的路径,按路径分隔符分解成一系列小的component。比如用户说,我要找"/path/to/dest"这个文件,那么我们的文件系统就会按path、to、dest一个
    一个来找,直到最后一个分量是文件或者查找完成。它找的时候,会先用path_init初始化过的根路径去找第一个分量,也就是path。然后用path的dentry->d_inode去找to,这样循环到最后一个。注意,内核会缓存找到的路径分量,
    所以往往只有第一次访问一个路径的时候,才会去访问磁盘,后面的访问会直接从缓存里找,下面会看到,很多与页高速缓存打交道的代码。但不管怎样,第一遍查找总是会访问磁盘的
    static int __link_path_walk(const char *name, struct nameidata *nd){..} 至此,按照每一个component查找完成之后,就会找到相应的文件,然后相应的打开工作就基本完成了

    Relevant Link:

    http://oss.org.cn/kernel-book/
    http://blog.csdn.net/f413933206/article/details/5701913

    2. close() syscall

    close()系统调用对应内核中的函数为: sys_close()

    linux-2.6.32.63/fs/open.c

    /*
     * Careful here! We test whether the file pointer is NULL before
     * releasing the fd. This ensures that one clone task can't release
     * an fd while another clone is opening it.
     *
     * close(2) entry point: under files->file_lock, detach descriptor fd
     * from the current task's descriptor table, then close the file it
     * referred to. Returns -EBADF when fd is beyond the table or its slot is
     * empty; otherwise the filp_close() result, with restart codes mapped to
     * -EINTR.
     */
    SYSCALL_DEFINE1(close, unsigned int, fd)
    {
        struct files_struct *files = current->files;
        struct fdtable *table;
        struct file *victim = NULL;
        int retval;

        spin_lock(&files->file_lock);

        /*
         * files_fdtable() yields the struct fdtable of this files_struct;
         * include/linux/fdtable.h defines it as
         * (rcu_dereference((files)->fdt)).
         */
        table = files_fdtable(files);

        /* fetch the struct file stored in slot fd, if fd is inside the table */
        if (fd < table->max_fds) {
            victim = table->fd[fd];
        }
        if (!victim) {
            spin_unlock(&files->file_lock);
            return -EBADF;
        }

        /* empty the slot so the file can no longer be reached through fd */
        rcu_assign_pointer(table->fd[fd], NULL);
        FD_CLR(fd, table->close_on_exec);

        /*
         * Recycle the descriptor number so a later open can reuse it:
         * __put_unused_fd() clears fd in open_fds and lowers files->next_fd
         * to fd when fd is smaller.
         */
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);

        retval = filp_close(victim, files);

        /* can't restart close syscall because file table entry was cleared */
        if (unlikely(retval == -ERESTARTSYS ||
                     retval == -ERESTARTNOINTR ||
                     retval == -ERESTARTNOHAND ||
                     retval == -ERESTART_RESTARTBLOCK)) {
            retval = -EINTR;
        }

        return retval;
    }
    EXPORT_SYMBOL(sys_close);

    对于sys_close(),我们需要重点跟进2个函数: rcu_assign_pointer(fdt->fd[fd], NULL);、retval = filp_close(filp, files);

    linux-2.6.32.63/include/linux/rcupdate.h

    /**
     * rcu_assign_pointer - assign (publicize) a pointer to a newly
     * initialized structure that will be dereferenced by RCU read-side
     * critical sections.  Returns the value assigned.
     *
     * Inserts memory barriers on architectures that require them
     * (pretty much all of them other than x86), and also prevents
     * the compiler from reordering the code that initializes the
     * structure after the pointer assignment.  More importantly, this
     * call documents which pointers will be dereferenced by RCU read-side
     * code.
     *
     * The write barrier is skipped only when v is the compile-time
     * constant NULL; the assignment itself always happens. The body is a
     * statement expression spread over several lines, so every line except
     * the last must end in a backslash to remain part of the macro.
     */

    #define rcu_assign_pointer(p, v) \
        ({ \
            if (!__builtin_constant_p(v) || \
                ((v) != NULL)) \
                smp_wmb(); \
            (p) = (v); \
        })

    我们知道,每个进程在kernel中都有一个对应的task_struct,而通过task_struct可以间接地获得一个fd_array[]数组,表示当前进程已经打开的文件:数组下标就是文件描述符的值,每一个元素都是指向对应struct file的指针,只有通过这个fd_array[x]才能获取当前进程打开的文件的struct file*。而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在于将这个数组的指定元素置空,即断开了这个引用的关系,至于之后内核内存中的那个struct file是否释放,那是内存回收的事,至少现在进程想通过task_struct是无法再引用到之前打开过的文件了,这里面的关系图可以参阅:

    http://www.cnblogs.com/LittleHann/p/3865490.html
    //搜索: 用一张图表示task_struct、fs_struct、files_struct、fdtable、file的关系

    我们继续分析retval = filp_close(filp, files);

    linux-2.6.32.63/fs/open.c

    /*
     * "id" is the POSIX thread ID. We use the
     * files pointer for this..
     *
     * filp_close - close one reference to an open file on behalf of owner id.
     *
     * If the file's reference count is already zero, an error is logged and
     * 0 is returned without touching the file. Otherwise the file's ->flush
     * operation is invoked when one is provided, dnotify_flush() and
     * locks_remove_posix() are called for this owner, and the caller's
     * reference is dropped with fput().
     *
     * Returns the ->flush result, or 0 when there is no flush operation.
     */
    int filp_close(struct file *filp, fl_owner_t id)
    {
        int retval = 0;

        if (!file_count(filp))
        {
            /* the message must end in an escaped newline, not a raw line break */
            printk(KERN_ERR "VFS: Close: file count is 0\n");
            return 0;
        }

        if (filp->f_op && filp->f_op->flush)
        {
            retval = filp->f_op->flush(filp, id);
        }

        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);
        /* drop this caller's reference to the file */
        fput(filp);
        return retval;
    }

    filp_close()在必要时调用文件的flush操作,清理dnotify和POSIX锁,最后通过fput()放掉当前进程对struct file的这一次引用;只有当引用计数降为0时,表示打开文件的struct file内存空间才会真正被释放。至此,当前进程中就再也没有之前打开过的这个文件的任何痕迹了

    Relevant Link:

    http://blog.csdn.net/ce123_zhouwei/article/details/8459794

    Copyright (c) 2014 LittleHann All rights reserved

  • 相关阅读:
    内存溢出
    接手新业务
    pjb fabu
    中文手册
    人背的时候,做啥都失败
    帮助开发人员学习
    python中的__dict__,__getattr__,__setattr__
    NetCore在Docker中发布及运行
    ELK基础配置
    IdentityServer4 手动验签及日志记录
  • 原文地址:https://www.cnblogs.com/LittleHann/p/3932624.html
Copyright © 2011-2022 走看看