zoukankan      html  css  js  c++  java
  • linux文件系统写过程简析

    linux写入磁盘过程经历VFS ->  页缓存(page cache) -> 具体的文件系统(ext2/3/4、XFS、ReiserFS等) -> Block IO ->设备驱动 -> SCSI指令(或者其他指令),总体来说linux文件写入磁盘过程比较复杂

    1、VFS(虚拟文件系统)

          Linux中采用了VFS的方式屏蔽了多个文件系统的差别, 当需要访问不同的设备或者其他文件系统时,采用挂载mount的方式访问其他设备或者其他文件系统(这里可以把文件系统理解为具体的设备)。正是因为使用了VFS,所以所有的文件系统设备使用统一的文件目录树视图访问,整个存储空间采用一个文件系统目录树来管理,屏蔽了底层多个文件系统之间的差别。当然,如果你需要把你自己编写的文件系统集成到Linux内核,采用VFS的方式进行访问,你需要采用模块加载的方式进行处理,相应的文件系统模块文件需要编入到系统目录/lib/modules/your-system-name/kernel/fs当中。当然VFS的作用远不止这些,通过VFS也可以访问设备,在Linux下所有的对象都是文件,简化了系统的访问。

         1.1 正常情况下,所有的文件操作通过系统调用进入到VFS中,特殊的处理,直接操作原始设备。文件系统写入的系统调用为:

      #include <unistd.h>

      ssize_t  write(int fd,  const void * buffer, size_t  count);

        1.2 当采用系统调用进入VFS时,接下来的处理交给VFS层。处理过程中比较重要的函数是vfs_write、generic_file_aio_write

      

     /*
      * vfs_write - VFS-level entry for writing @count bytes from the user
      * buffer @buf into @file at offset *@pos.
      *
      * Returns -EBADF, -EINVAL or -EFAULT when one of the up-front checks
      * rejects the request, a negative value from rw_verify_area() when that
      * check fails, and otherwise whatever the file's write method returns.
      */
     1 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
     2 {
     3     ssize_t ret;
     4 
     5     if (!(file->f_mode & FMODE_WRITE))    /* file was not opened for writing */
     6         return -EBADF;
     7     if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))    /* no write method at all */
     8         return -EINVAL;
     9     if (unlikely(!access_ok(VERIFY_READ, buf, count)))    /* buf is the data source, so it is checked for reading */
    10         return -EFAULT;
    11 
    12     ret = rw_verify_area(WRITE, file, pos, count);
    13     if (ret >= 0) {
    14         count = ret;    /* a non-negative result replaces the requested count */
    15         if (file->f_op->write)
    16             ret = file->f_op->write(file, buf, count, pos);
    17         else    /* only aio_write is provided: go through do_sync_write() */
    18             ret = do_sync_write(file, buf, count, pos);
    19         if (ret > 0) {    /* notification and accounting only when bytes were written */
    20             fsnotify_modify(file->f_path.dentry);
    21             add_wchar(current, ret);
    22         }
    23         inc_syscw(current);    /* counted whenever the write method was invoked, even if it failed */
    24     }
    25 
    26     return ret;
    27 }
     1 /**
     2  * generic_file_aio_write - write data to a file
     3  * @iocb:    IO state structure
     4  * @iov:    vector with data to write
     5  * @nr_segs:    number of segments in the vector
     6  * @pos:    position in file where to write
     7  *
     8  * This is a wrapper around __generic_file_aio_write() to be used by most
     9  * filesystems. It takes care of syncing the file in case of O_SYNC file
    10  * and acquires i_mutex as needed.
    11  */
    12 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
    13         unsigned long nr_segs, loff_t pos)
    14 {
    15     struct file *file = iocb->ki_filp;
    16     struct inode *inode = file->f_mapping->host;
    17     ssize_t ret;
    18 
    19     BUG_ON(iocb->ki_pos != pos);    /* callers must pass pos equal to iocb->ki_pos */
    20 
    21     mutex_lock(&inode->i_mutex);    /* i_mutex is held only around the write itself */
    22     ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
    23     mutex_unlock(&inode->i_mutex);
    24 
    25     if (ret > 0 || ret == -EIOCBQUEUED) {    /* bytes written, or the request was queued */
    26         ssize_t err;
    27 
    28         err = generic_write_sync(file, pos, ret);    /* runs after i_mutex has been dropped */
    29         if (err < 0 && ret > 0)    /* a sync error replaces a positive count; -EIOCBQUEUED is kept */
    30             ret = err;
    31     }
    32     return ret;
    33 }

      2、 VFS层的写入处理也分为采用page cache和不采用page cache两种方式,下面重点介绍采用page cache的处理。

          在VFS中, 每个打开操作的文件对应内核都有一个address_space 数据结构, 该数据结构是用来表示系统中打开的文件,并且一个打开的文件只有一个address_space数据结构。

    如下:   

     /*
      * struct address_space - ties an owner (an inode or a block device, see
      * @host) to the radix tree holding all of its pages (@page_tree) and to
      * the table of methods operating on them (@a_ops).
      */
     1 struct address_space {
     2     struct inode        *host;        /* owner: inode, block_device */
     3     struct radix_tree_root    page_tree;    /* radix tree of all pages */
     4     spinlock_t        tree_lock;    /* and lock protecting it */
     5     unsigned int        i_mmap_writable;/* count VM_SHARED mappings */
     6     struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */
     7     struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
     8     spinlock_t        i_mmap_lock;    /* protect tree, count, list */
     9     unsigned int        truncate_count;    /* Cover race condition with truncate */
    10     unsigned long        nrpages;    /* number of total pages */
    11     pgoff_t            writeback_index;/* writeback starts here */
    12     const struct address_space_operations *a_ops;    /* methods */
    13     unsigned long        flags;        /* error bits/gfp mask */
    14     struct backing_dev_info *backing_dev_info; /* device readahead, etc */
    15     spinlock_t        private_lock;    /* for use by the address_space */
    16     struct list_head    private_list;    /* also for use by the address_space */
    17     struct address_space    *assoc_mapping;    /* also for use by the address_space */
    18     struct mutex        unmap_mutex;    /* to protect unmapping */
    19 } __attribute__((aligned(sizeof(long))));    /* forces at least long alignment */

        对于文件中的文件内容缓存采用的是基数树的方式来保存的,在成员变量page_tree中,关于基数树的介绍参考[1]和[2]。 下面是关于page cache写处理的几个重要的函数    

     /*
      * generic_file_buffered_write - write the data described by @iov/@nr_segs
      * at @pos by calling generic_perform_write() on the file's mapping.
      *
      * @written is the caller's running byte count on entry: it is handed to
      * iov_iter_init() as its last argument and the bytes written here are
      * added to it. On success *@ppos is set to @pos plus the bytes written
      * by this call.
      *
      * Returns the accumulated @written when it is non-zero, otherwise the
      * last status value (which may be an error code).
      */
     1 ssize_t
     2 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
     3         unsigned long nr_segs, loff_t pos, loff_t *ppos,
     4         size_t count, ssize_t written)
     5 {
     6     struct file *file = iocb->ki_filp;
     7     struct address_space *mapping = file->f_mapping;
     8     ssize_t status;
     9     struct iov_iter i;
    10 
    11     iov_iter_init(&i, iov, nr_segs, count, written);
    12     status = generic_perform_write(file, &i, pos);
    13 
    14     if (likely(status >= 0)) {    /* status is the byte count of this call */
    15         written += status;
    16         *ppos = pos + status;
    17       }
    18     
    19     /*
    20      * If we get here for O_DIRECT writes then we must have fallen through
    21      * to buffered writes (block instantiation inside i_size).  So we sync
    22      * the file data here, to try to honour O_DIRECT expectations.
    23      */
    24     if (unlikely(file->f_flags & O_DIRECT) && written)
    25         status = filemap_write_and_wait_range(mapping,
    26                     pos, pos + written - 1);    /* range end is inclusive, hence the -1 */
    27 
    28     return written ? written : status;    /* any progress wins over the status code */
    29 }

        调用page cache中的write_begin 和write_end 

        Note: 在进行VFS系统调用写入文件过程中,可以允许在文件中的任何位置写入,这其中就包括当写入的过程中写入的起始位置不是一个block的开始位置,这时需要特殊的处理,上述的过程都在write_begin这个函数调用过程中处理完毕。

    3、ext2/3/4中文件的处理。

       当在page cache中进行到write_begin时,需要ext4中的ext4_write_begin处理, 如下:   

     /*
      * ext4_write_begin - prepare the page covering @pos for a write of @len
      * bytes: start a journal handle, grab and lock the page, and let
      * block_write_begin() map its blocks through ext4_get_block.
      *
      * On success returns 0 with the locked page stored in *@pagep. On
      * failure the page is released, blocks past i_size are trimmed, and a
      * negative error is returned; -ENOSPC may trigger a retry.
      *
      * NOTE(review): this listing elides part of the function ("........."),
      * so the computation of needed_blocks is not shown here.
      */
     1 static int ext4_write_begin(struct file *file, struct address_space *mapping,
     2                 loff_t pos, unsigned len, unsigned flags,
     3                 struct page **pagep, void **fsdata)
     4 {
     5     struct inode *inode = mapping->host;
     6     int ret, needed_blocks;
     7     handle_t *handle;
     8     int retries = 0;
     9     struct page *page;
    10     pgoff_t index;
    11     unsigned from, to;
    12         .........
    13 
    14     index = pos >> PAGE_CACHE_SHIFT;    /* page number containing pos */
    15     from = pos & (PAGE_CACHE_SIZE - 1);    /* byte offset of pos inside that page */
    16     to = from + len;
    17 
    18 retry:
    19     handle = ext4_journal_start(inode, needed_blocks);
    20     if (IS_ERR(handle)) {
    21         ret = PTR_ERR(handle);
    22         goto out;
    23     }
    24 
    25     /* We cannot recurse into the filesystem as the transaction is already
    26      * started */
    27     flags |= AOP_FLAG_NOFS;
    28 
    29     page = grab_cache_page_write_begin(mapping, index, flags);
    30     if (!page) {
    31         ext4_journal_stop(handle);    /* close the handle opened above before bailing out */
    32         ret = -ENOMEM;
    33         goto out;
    34     }
    35     *pagep = page;    /* block_write_begin() uses the page passed in *pagep when it is non-NULL */
    36 
    37     ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
    38                 ext4_get_block);
    39 
    40     if (!ret && ext4_should_journal_data(inode)) {    /* data journalling: get write access to each buffer in [from, to) */
    41         ret = walk_page_buffers(handle, page_buffers(page),
    42                 from, to, NULL, do_journal_get_write_access);
    43     }
    44 
    45     if (ret) {    /* failure: undo everything done so far */
    46         unlock_page(page);
    47         page_cache_release(page);
    48         /*
    49          * block_write_begin may have instantiated a few blocks
    50          * outside i_size.  Trim these off again. Don't need
    51          * i_size_read because we hold i_mutex.
    52          *
    53          * Add inode to orphan list in case we crash before
    54          * truncate finishes
    55          */
    56         if (pos + len > inode->i_size && ext4_can_truncate(inode))
    57             ext4_orphan_add(handle, inode);
    58 
    59         ext4_journal_stop(handle);    /* the handle is stopped before the truncate below */
    60         if (pos + len > inode->i_size) {
    61             ext4_truncate_failed_write(inode);
    62             /*
    63              * If truncate failed early the inode might
    64              * still be on the orphan list; we need to
    65              * make sure the inode is removed from the
    66              * orphan list in that case.
    67              */
    68             if (inode->i_nlink)
    69                 ext4_orphan_del(NULL, inode);
    70         }
    71     }
    72 
    73     if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))    /* out of space: retry from journal start if allowed */
    74         goto retry;
    75 out:
    76     return ret;
    77 }

           其中在ext4_write_begin中包含了很多的处理功能,包括文件物理块的分配(假设ext4中的delay allocation特性没有开启)、文件块的部分写过程的处理等。下面是在ext4_write_begin函数调用过程中比较重要的几个函数。 

     1 /*
     2  * block_write_begin takes care of the basic task of block allocation and
     3  * bringing partial write blocks uptodate first.
     4  *
     5  * If *pagep is not NULL, then block_write_begin uses the locked page
     6  * at *pagep rather than allocating its own. In this case, the page will
     7  * not be unlocked or deallocated on failure.
     8  *
     9  * Returns 0 on success, -ENOMEM if no page could be obtained, or the
    10  * error reported by __block_prepare_write().
    11  */
     9 int block_write_begin(struct file *file, struct address_space *mapping,
    10             loff_t pos, unsigned len, unsigned flags,
    11             struct page **pagep, void **fsdata,
    12             get_block_t *get_block)
    13 {
    14     struct inode *inode = mapping->host;
    15     int status = 0;
    16     struct page *page;
    17     pgoff_t index;
    18     unsigned start, end;
    19     int ownpage = 0;    /* set when the page is grabbed here rather than supplied by the caller */
    20 
    21     index = pos >> PAGE_CACHE_SHIFT;    /* page number containing pos */
    22     start = pos & (PAGE_CACHE_SIZE - 1);    /* byte offset of pos inside that page */
    23     end = start + len;
    24 
    25     page = *pagep;
    26     if (page == NULL) {
    27         ownpage = 1;
    28         page = grab_cache_page_write_begin(mapping, index, flags);
    29         if (!page) {
    30             status = -ENOMEM;
    31             goto out;
    32         }
    33         *pagep = page;
    34     } else
    35         BUG_ON(!PageLocked(page));    /* a caller-supplied page must already be locked */
    36 
    37     status = __block_prepare_write(inode, page, start, end, get_block);
    38     if (unlikely(status)) {
    39         ClearPageUptodate(page);
    40 
    41         if (ownpage) {    /* only a page obtained here is unlocked and released on failure */
    42             unlock_page(page);
    43             page_cache_release(page);
    44             *pagep = NULL;
    45 
    46             /*
    47              * prepare_write() may have instantiated a few blocks
    48              * outside i_size.  Trim these off again. Don't need
    49              * i_size_read because we hold i_mutex.
    50              */
    51             if (pos + len > inode->i_size)
    52                 vmtruncate(inode, inode->i_size);
    53         }
    54     }
    55 
    56 out:
    57     return status;
    58 }

           

     /*
      * __block_prepare_write - get the buffers of @page that overlap the byte
      * range [@from, @to) ready to be written.
      *
      * Walks every buffer_head attached to the page. Buffers inside the range
      * that are not yet mapped are mapped through @get_block; newly allocated
      * buffers have the bytes outside the range zeroed; existing buffers that
      * are only partly covered and not uptodate are read in first.
      *
      * Returns 0 on success, the error from @get_block, or -EIO if one of the
      * reads did not leave its buffer uptodate.
      */
     1 static int __block_prepare_write(struct inode *inode, struct page *page,
     2         unsigned from, unsigned to, get_block_t *get_block)
     3 {
     4     unsigned block_start, block_end;
     5     sector_t block;
     6     int err = 0;
     7     unsigned blocksize, bbits;
     8     struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;    /* wait[] collects buffers with reads in flight; only the blocks cut by from/to can qualify */
     9 
    10     BUG_ON(!PageLocked(page));
    11     BUG_ON(from > PAGE_CACHE_SIZE);
    12     BUG_ON(to > PAGE_CACHE_SIZE);
    13     BUG_ON(from > to);
    14 
    15     blocksize = 1 << inode->i_blkbits;
    16     if (!page_has_buffers(page))
    17         create_empty_buffers(page, blocksize, 0);
    18     head = page_buffers(page);
    19 
    20     bbits = inode->i_blkbits;
    21     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);    /* block number of the page's first block */
    22 
    23     for(bh = head, block_start = 0; bh != head || !block_start;    /* the list is circular: "!block_start" admits the first pass where bh == head */
    24         block++, block_start=block_end, bh = bh->b_this_page) {
    25         block_end = block_start + blocksize;
    26         if (block_end <= from || block_start >= to) {    /* buffer lies entirely outside the write range */
    27             if (PageUptodate(page)) {
    28                 if (!buffer_uptodate(bh))
    29                     set_buffer_uptodate(bh);
    30             }
    31             continue;
    32         }
    33         if (buffer_new(bh))
    34             clear_buffer_new(bh);
    35         if (!buffer_mapped(bh)) {
    36             WARN_ON(bh->b_size != blocksize);
    37             err = get_block(inode, block, bh, 1);    /* map the block; last argument 1 is presumably "create" - confirm against get_block_t */
    38             if (err)
    39                 break;
    40             if (buffer_new(bh)) {    /* get_block flagged the buffer as new */
    41                 unmap_underlying_metadata(bh->b_bdev,
    42                             bh->b_blocknr);
    43                 if (PageUptodate(page)) {
    44                     clear_buffer_new(bh);
    45                     set_buffer_uptodate(bh);
    46                     mark_buffer_dirty(bh);
    47                     continue;
    48                 }
    49                 if (block_end > to || block_start < from)    /* partly covered new block: zero the bytes outside [from, to) */
    50                     zero_user_segments(page,
    51                         to, block_end,
    52                         block_start, from);
    53                 continue;
    54             }
    55         }
    56         if (PageUptodate(page)) {
    57             if (!buffer_uptodate(bh))
    58                 set_buffer_uptodate(bh);
    59             continue; 
    60         }
    61         if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
    62             !buffer_unwritten(bh) &&
    63              (block_start < from || block_end > to)) {    /* partly covered and stale: read the block before it is overwritten */
    64             ll_rw_block(READ, 1, &bh);
    65             *wait_bh++=bh;
    66         }
    67     }
    68     /*
    69      * If we issued read requests - let them complete.
    70      */
    71     while(wait_bh > wait) {
    72         wait_on_buffer(*--wait_bh);
    73         if (!buffer_uptodate(*wait_bh))    /* still not uptodate after the wait: report an I/O error */
    74             err = -EIO;
    75     }
    76     if (unlikely(err))
    77         page_zero_new_buffers(page, from, to);
    78     return err;
    79 }

         

     1 /**
     2  * ll_rw_block: low-level access to block devices (DEPRECATED)
     3  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
     4  * @nr: number of &struct buffer_heads in the array
     5  * @bhs: array of pointers to &struct buffer_head
     6  *
     7  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
     8  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
     9  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
    10  * are sent to disk. The fourth %READA option is described in the documentation
    11  * for generic_make_request() which ll_rw_block() calls.
    12  *
    13  * This function drops any buffer that it cannot get a lock on (with the
    14  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
    15  * clean when doing a write request, and any buffer that appears to be
    16  * up-to-date when doing read request.  Further it marks as clean buffers that
    17  * are processed for writing (the buffer cache won't assume that they are
    18  * actually clean until the buffer gets unlocked).
    19  *
    20  * ll_rw_block sets b_end_io to simple completion handler that marks
    21  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
    22  * any waiters. 
    23  *
    24  * All of the buffers must be for the same device, and must also be a
    25  * multiple of the current approved size for the device.
    26  */
    27 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
    28 {
    29     int i;
    30 
    31     for (i = 0; i < nr; i++) {
    32         struct buffer_head *bh = bhs[i];
    33 
    34         if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
    35             lock_buffer(bh);    /* SWRITE variants always take the lock */
    36         else if (!trylock_buffer(bh))    /* all other requests skip a buffer whose lock cannot be taken */
    37             continue;
    38 
    39         if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
    40             rw == SWRITE_SYNC_PLUG) {
    41             if (test_clear_buffer_dirty(bh)) {    /* submit only buffers that were dirty; the dirty bit is cleared here */
    42                 bh->b_end_io = end_buffer_write_sync;
    43                 get_bh(bh);
    44                 if (rw == SWRITE_SYNC)
    45                     submit_bh(WRITE_SYNC, bh);
    46                 else
    47                     submit_bh(WRITE, bh);
    48                 continue;    /* submitted: the unlock_buffer() below is skipped */
    49             }
    50         } else {
    51             if (!buffer_uptodate(bh)) {    /* read only buffers that are not already up to date */
    52                 bh->b_end_io = end_buffer_read_sync;
    53                 get_bh(bh);
    54                 submit_bh(rw, bh);
    55                 continue;    /* submitted: the unlock_buffer() below is skipped */
    56             }
    57         }
    58         unlock_buffer(bh);    /* nothing was submitted for this buffer: drop the lock taken above */
    59     }
    60 }

         其中在ext4中块的分配过程中,管理块分配处理的函数实现在fs/ext4/balloc.c  fs/ext4/mballoc.c

       4、当page cache中的数据需要刷新到disk上的时候,这时处理的过程由Block IO接管。

          在进行文件page cache刷新到disk上的过程中比较重要的数据结构有如下两个buffer_head 和 bio      

     /*
      * struct buffer_head - describes one block-sized buffer inside a page:
      * which page and data it covers (@b_page, @b_data, @b_size), which
      * block on which device it corresponds to (@b_blocknr, @b_bdev), and
      * the completion callback used for I/O on it (@b_end_io).
      */
     1 struct buffer_head {
     2     unsigned long b_state;        /* buffer state bitmap (see above) */
     3     struct buffer_head *b_this_page;/* circular list of page's buffers */
     4     struct page *b_page;        /* the page this bh is mapped to */
     5 
     6     sector_t b_blocknr;        /* start block number */
     7     size_t b_size;            /* size of mapping */
     8     char *b_data;            /* pointer to data within the page */
     9 
    10     struct block_device *b_bdev;    /* block device this buffer belongs to */
    11     bh_end_io_t *b_end_io;        /* I/O completion */
    12      void *b_private;        /* reserved for b_end_io */
    13     struct list_head b_assoc_buffers; /* associated with another mapping */
    14     struct address_space *b_assoc_map;    /* mapping this buffer is
    15                            associated with */
    16     atomic_t b_count;        /* users using this buffer_head */
    17 };

       

     1 /*
     2  * main unit of I/O for the block layer and lower layers (ie drivers and
     3  * stacking drivers)
     4  *
     5  * NOTE(review): this listing elides some members ("...............");
     6  * only the fields shown here are described.
     7  */
     5 struct bio {
     6     sector_t        bi_sector;    /* device address in 512 byte
     7                            sectors */
     8     struct bio        *bi_next;    /* request queue link */
     9     struct block_device    *bi_bdev;    /* target block device */
    10     unsigned long        bi_flags;    /* status, command, etc */
    11     unsigned long        bi_rw;        /* bottom bits READ/WRITE,
    12                          * top bits priority
    13                          */
    14 
    15     unsigned short        bi_vcnt;    /* how many bio_vec's */
    16     unsigned short        bi_idx;        /* current index into bvl_vec */
    17     ...............
    18 
    19     /*
    20      * We can inline a number of vecs at the end of the bio, to avoid
    21      * double allocations for a small number of bio_vecs. This member
    22      * MUST obviously be kept at the very end of the bio.
    23      */
    24     struct bio_vec        bi_inline_vecs[0];    /* zero-length trailing array; storage is allocated past the struct */
    25 };

       在Block IO层进行基本的IO request的合并和调度处理,这一层由elevator管理,具体的调度算法有noop、deadline和anticipatory等多种,现在默认的调度算法是deadline。当然调度算法是可调的,可以根据系统的实际情况调整为最优的算法。 

    [1] 基数树(radix tree). http://blog.csdn.net/joker0910/article/details/8250085

    [2] Radix Tree. http://en.wikipedia.org/wiki/Radix_tree 

  • 相关阅读:
    计算机网络知识
    数据库知识
    操作系统知识
    计算机硬件基础知识
    计算机科学基础知识
    2019下半年软件设计师考试大纲
    软件设计师补题(2008下半年上午题)
    软件设计师补题(2008上半年上午题)
    测试复盘3
    测试复盘2
  • 原文地址:https://www.cnblogs.com/linghuchong0605/p/4515542.html
Copyright © 2011-2022 走看看