  • read & write


    SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
    {
      struct fd f = fdget_pos(fd);
      ......
      loff_t pos = file_pos_read(f.file);
      ret = vfs_read(f.file, buf, count, &pos);
      ......
    }
    
    SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
        size_t, count)
    {
      struct fd f = fdget_pos(fd);
      ......
      loff_t pos = file_pos_read(f.file);
      ret = vfs_write(f.file, buf, count, &pos);
      ......
    }
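
    Both syscalls read the current position with file_pos_read, hand it to vfs_read/vfs_write, and store the updated position back afterwards. The user-visible effect is the familiar shared file offset; a minimal userspace sketch (plain POSIX calls, any readable file will do) showing read advancing the offset while pread leaves it alone:

    // offset_demo.c - read() advances the fd's offset, pread() does not.
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    
    int main(void)
    {
      char buf[8];
      int fd = open("/etc/hostname", O_RDONLY);   /* any readable file */
      if (fd < 0) return 1;
    
      read(fd, buf, sizeof(buf));                 /* moves the offset */
      printf("offset after read:  %ld\n", (long)lseek(fd, 0, SEEK_CUR));
    
      pread(fd, buf, sizeof(buf), 0);             /* offset untouched */
      printf("offset after pread: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
    
      close(fd);
      return 0;
    }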
    
    // vfs_read->__vfs_read
    // vfs_write->__vfs_write
    
    ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
           loff_t *pos)
    {
      if (file->f_op->read)
        return file->f_op->read(file, buf, count, pos);
      else if (file->f_op->read_iter) // e.g. ext4_file_read_iter
        return new_sync_read(file, buf, count, pos);
      else
        return -EINVAL;
    }
    
    ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
            loff_t *pos)
    {
      if (file->f_op->write)
        return file->f_op->write(file, p, count, pos);
      else if (file->f_op->write_iter) // e.g. ext4_file_write_iter
        return new_sync_write(file, p, count, pos);
      else
        return -EINVAL;
    }
    
    const struct file_operations ext4_file_operations = {
      ......
      .read_iter  = ext4_file_read_iter,
      .write_iter  = ext4_file_write_iter,
      ......
    };
    
    // ext4_file_read_iter -> generic_file_read_iter
    // ext4_file_write_iter -> __generic_file_write_iter
    
    ssize_t
    generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
    {
        ......
        if (iocb->ki_flags & IOCB_DIRECT) {
        ......
            struct address_space *mapping = file->f_mapping;
        ......
            retval = mapping->a_ops->direct_IO(iocb, iter);
        }
        ......
        retval = generic_file_buffered_read(iocb, iter, retval);
    }
    
    ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    {
        ......
        if (iocb->ki_flags & IOCB_DIRECT) {
        ......
            written = generic_file_direct_write(iocb, from);
        ......
        } else {
        ......
            written = generic_perform_write(file, from, iocb->ki_pos);
        ......
        }
    }

    generic_file_read_iter and __generic_file_write_iter share the same basic logic: decide whether the I/O should go through the page cache.

    If IOCB_DIRECT is set, they call the address_space's direct_IO function, which reads and writes the disk directly.

    We met address_space when mapping files into memory with mmap; there it is what associates the file with memory pages for a memory mapping.

    The cache likewise needs to associate the file with memory pages, and it uses address_space for that too. The operations on an address_space are defined in struct address_space_operations.

    For the ext4 filesystem, the address_space operations are defined in ext4_aops, and direct_IO corresponds to ext4_direct_IO.

    ext4_direct_IO ultimately calls __blockdev_direct_IO->do_blockdev_direct_IO, which bypasses the cache layer, enters the generic block layer, and finally reaches the device driver layer.

    Because the filesystem sits on a block device, the blockdev-related functions are the ones called here.
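
    To exercise the direct path from user space, open the file with O_DIRECT. A hedged sketch (the file name is hypothetical, and O_DIRECT typically requires the buffer, offset, and length to be aligned to the device's logical block size; 4096 bytes is assumed safe here):

    // direct_read.c - read a file bypassing the page cache via O_DIRECT.
    #define _GNU_SOURCE            /* for O_DIRECT on Linux */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    
    int main(void)
    {
      void *buf;
      ssize_t n;
      int fd = open("testfile", O_RDONLY | O_DIRECT);
      if (fd < 0) { perror("open"); return 1; }
    
      /* O_DIRECT buffers must be aligned; 4096 assumed sufficient */
      if (posix_memalign(&buf, 4096, 4096)) return 1;
    
      n = read(fd, buf, 4096);   /* served by ->direct_IO, not the cache */
      printf("read %zd bytes\n", n);
    
      free(buf);
      close(fd);
      return 0;
    }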

    static const struct address_space_operations ext4_aops = {
      ......
      .direct_IO    = ext4_direct_IO,
      ......
    };
    
    /*
     * This is a library function for use by filesystem drivers.
     */
    static inline ssize_t
    do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
              struct block_device *bdev, struct iov_iter *iter,
              get_block_t get_block, dio_iodone_t end_io,
              dio_submit_t submit_io, int flags)
    {......}

     

    Buffered writes

    ssize_t generic_perform_write(struct file *file,
            struct iov_iter *i, loff_t pos)
    {
      struct address_space *mapping = file->f_mapping;
      const struct address_space_operations *a_ops = mapping->a_ops;
      do {
        struct page *page;
        unsigned long offset;  /* Offset into pagecache page */
        unsigned long bytes;  /* Bytes to write to page */
        // 1. For each page, call the address_space's write_begin to prepare
        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                &page, &fsdata);
        // 2. Copy the data to write from user space into the kernel page
        copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
        flush_dcache_page(page);
        // 3. Call the address_space's write_end to finish the write
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                page, fsdata);
        pos += copied;
        written += copied;

        // 4. Check whether too many pages are dirty and need writing back.
        //    A dirty page is one written into the cache but not yet to disk.
        balance_dirty_pages_ratelimited(mapping);
      } while (iov_iter_count(i));
    }
    
    static const struct address_space_operations ext4_aops = {
    ......
      .write_begin    = ext4_write_begin,
      .write_end    = ext4_write_end,
    ......
    };
    
    struct page *grab_cache_page_write_begin(struct address_space *mapping,
              pgoff_t index, unsigned flags)
    {
      struct page *page;
      int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
      page = pagecache_get_page(mapping, index, fgp_flags,
          mapping_gfp_mask(mapping));
      if (page)
        wait_for_stable_page(page);
      return page;
    }
    
    struct address_space {
      struct inode    *host;    /* owner: inode, block_device */
      struct radix_tree_root  page_tree;  /* radix tree of all pages */
      spinlock_t    tree_lock;  /* and lock protecting it */
    ......
    };

    Step 1: for ext4 this is ext4_write_begin. ext4_write_begin calls ext4_journal_start to do the journaling work, then calls grab_cache_page_write_begin to get the cache page that should be written. Cache pages live in a radix tree; pagecache_get_page looks a page up in this tree by the long integer pgoff_t index, and creates one if none is found.
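
    The pgoff_t index and the in-page offset are plain arithmetic on the file position. A small standalone sketch of the math generic_perform_write uses per iteration (4 KiB pages assumed):

    // page_index.c - index/offset math behind the write loop above.
    #include <stdio.h>
    
    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)   /* assume 4 KiB pages */
    
    int main(void)
    {
      unsigned long pos = 10000, count = 6000;   /* example write */
    
      unsigned long index  = pos >> PAGE_SHIFT;      /* radix-tree key: 2 */
      unsigned long offset = pos & (PAGE_SIZE - 1);  /* 1808 into the page */
      unsigned long bytes  = PAGE_SIZE - offset;     /* room left: 2288 */
      if (bytes > count)
        bytes = count;
    
      printf("index=%lu offset=%lu bytes=%lu\n", index, offset, bytes);
      return 0;
    }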

    Step 3: ext4_write_end completes the write. It calls ext4_journal_stop to finish the journal write, then block_write_end->__block_commit_write->mark_buffer_dirty to mark the modified cache page dirty. So "completing" the write does not actually touch the disk: the data only lands in the cache and the page is marked dirty.

    That leaves a problem: the data is at risk, and a crash would lose it. So a mechanism is needed that really writes the dirtied pages to disk; this is called write back.
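
    From user space the window can be closed explicitly with fsync (or fdatasync for data only), which forces the dirty pages out before returning. A minimal sketch, hypothetical file name:

    // durable_write.c - write() only dirties the page cache;
    // fsync() forces the dirty pages and metadata to disk.
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    
    int main(void)
    {
      int fd = open("data.log", O_WRONLY | O_CREAT | O_APPEND, 0644);
      if (fd < 0) { perror("open"); return 1; }
    
      write(fd, "important record\n", 17);  /* lands in the page cache */
      if (fsync(fd) < 0)                    /* now it is really on disk */
        perror("fsync");
    
      close(fd);
      return 0;
    }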

    size_t iov_iter_copy_from_user_atomic(struct page *page,
        struct iov_iter *i, unsigned long offset, size_t bytes)
    {
      // 1. Map the allocated page to a virtual address inside the kernel
      char *kaddr = kmap_atomic(page), *p = kaddr + offset;
      // 2. Copy the user-space data into the kernel page at that address
      iterate_all_kinds(i, bytes, v,
        copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
        memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
             v.bv_offset, v.bv_len),
        memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
      )
      // 3. Remove the kernel mapping
      kunmap_atomic(kaddr);
      return bytes;
    }
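
    The kernel has to map the page before it can memcpy into it. User space can dirty the same page-cache pages without a copy by mmap-ing the file and storing through the mapping; a hedged sketch (hypothetical file, assumed to be at least one page long):

    // mmap_write.c - dirty a page-cache page through a shared mapping
    // instead of copying via write().
    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>
    
    int main(void)
    {
      int fd = open("data.bin", O_RDWR);
      if (fd < 0) return 1;
    
      char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if (p == MAP_FAILED) return 1;
    
      memcpy(p, "hello", 5);      /* marks the backing page dirty */
      msync(p, 4096, MS_SYNC);    /* optional: force writeback now */
    
      munmap(p, 4096);
      close(fd);
      return 0;
    }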
    
    /**
     * balance_dirty_pages_ratelimited - balance dirty memory state
     * @mapping: address_space which was dirtied
     *
     * Processes which are dirtying memory should call in here once for each page
     * which was newly dirtied.  The function will periodically check the system's
     * dirty state and will initiate writeback if needed.
     */
    void balance_dirty_pages_ratelimited(struct address_space *mapping)
    {
      struct inode *inode = mapping->host;
      struct backing_dev_info *bdi = inode_to_bdi(inode);
      struct bdi_writeback *wb = NULL;
      int ratelimit;
    ......
      if (unlikely(current->nr_dirtied >= ratelimit))
        balance_dirty_pages(mapping, wb, current->nr_dirtied);
    ......
    }
    // When the number of dirty pages exceeds the configured threshold,
    // balance_dirty_pages->wb_start_background_writeback starts a
    // background thread to write them back.
    void wb_start_background_writeback(struct bdi_writeback *wb)
    {
      /*
       * We just wake up the flusher thread. It will perform background
       * writeback as soon as there is no other work to do.
       */
      wb_wakeup(wb);
    }
    
    static void wb_wakeup(struct bdi_writeback *wb)
    {
      spin_lock_bh(&wb->work_lock);
      if (test_bit(WB_registered, &wb->state))
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
      spin_unlock_bh(&wb->work_lock);
    }
    
    // bdi_wq is a global variable; all writeback tasks hang on this queue.
    // mod_delayed_work is what hangs a writeback task (a bdi_writeback) on it.
    /* bdi_wq serves all asynchronous writeback tasks */
    struct workqueue_struct *bdi_wq;
    
    /**
     * mod_delayed_work - modify delay of or queue a delayed work
     * @wq: workqueue to use
     * @dwork: work to queue
     * @delay: number of jiffies to wait before queueing
     *
     * mod_delayed_work_on() on local CPU.
     */
    // The bdi_writeback is queued on bdi_wq as dwork with delay = 0,
    // i.e. it does not wait at all and runs immediately.
    static inline bool mod_delayed_work(struct workqueue_struct *wq,
          struct delayed_work *dwork, unsigned long delay)
    {....}
    
    /* bdi stands for backing device info and describes the backing store.
     * Every block device has one; bdi_init sets it up when the block device
     * is initialized, and it in turn calls wb_init to initialize the
     * bdi_writeback.
     */
    static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
           int blkcg_id, gfp_t gfp)
    {
      wb->bdi = bdi;
      wb->last_old_flush = jiffies;
      INIT_LIST_HEAD(&wb->b_dirty);
      INIT_LIST_HEAD(&wb->b_io);
      INIT_LIST_HEAD(&wb->b_more_io);
      INIT_LIST_HEAD(&wb->b_dirty_time);
      wb->bw_time_stamp = jiffies;
      wb->balanced_dirty_ratelimit = INIT_BW;
      wb->dirty_ratelimit = INIT_BW;
      wb->write_bandwidth = INIT_BW;
      wb->avg_write_bandwidth = INIT_BW;
      spin_lock_init(&wb->work_lock);
      INIT_LIST_HEAD(&wb->work_list);
      // Set up a timer; when it fires, wb_workfn runs.
      INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
      wb->dirty_sleep = jiffies;
      ......
    }
    
    #define __INIT_DELAYED_WORK(_work, _func, _tflags)          \
      do {                                                      \
        INIT_WORK(&(_work)->work, (_func));                     \
        __setup_timer(&(_work)->timer, delayed_work_timer_fn,   \
                (unsigned long)(_work),                         \
                (_tflags) | TIMER_IRQSAFE);                     \
      } while (0)

    The call chain from there is:

    wb_workfn->wb_do_writeback->wb_writeback->writeback_sb_inodes->__writeback_single_inode->do_writepages, which writes the pages to disk.
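
    The thresholds that decide when writeback kicks in are the vm.dirty_* sysctls, and user space can also wake the flusher directly with sync. A small sketch that reads the background threshold and then forces a flush:

    // flush_demo.c - show the background dirty threshold, then force writeback.
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    
    int main(void)
    {
      char line[32];
      FILE *f = fopen("/proc/sys/vm/dirty_background_ratio", "r");
      if (f && fgets(line, sizeof(line), f)) {
        line[strcspn(line, "\n")] = '\0';
        printf("background writeback starts at %s%% dirty memory\n", line);
      }
      if (f)
        fclose(f);
    
      sync();   /* wake the flusher threads and write everything back */
      return 0;
    }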

     

    Buffered reads

    static ssize_t generic_file_buffered_read(struct kiocb *iocb,
        struct iov_iter *iter, ssize_t written)
    {
      struct file *filp = iocb->ki_filp;
      struct address_space *mapping = filp->f_mapping;
      struct inode *inode = mapping->host;
      for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        // 1. First look in the page cache for a cached page
        page = find_get_page(mapping, index);
        if (!page) {
          if (iocb->ki_flags & IOCB_NOWAIT)
            goto would_block;
          // 2. Not found: read this page, and read ahead as well
          page_cache_sync_readahead(mapping,
              ra, filp,
              index, last_index - index);
          // 3. After the readahead, look up the cache page again;
          //    now it should be there
          page = find_get_page(mapping, index);
          if (unlikely(page == NULL))
            goto no_cached_page;
        }
        // 4. Even if the first lookup hit, check whether we should
        //    keep reading ahead
        if (PageReadahead(page)) {
          // 5. If so, kick off an asynchronous readahead
          page_cache_async_readahead(mapping,
              ra, filp, page,
              index, last_index - index);
        }
        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         * 6. Copy the contents from the kernel cache page to user memory
         */
        ret = copy_page_to_iter(page, offset, nr, iter);
      }
    }
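
    User space can steer this readahead logic with posix_fadvise: on Linux, POSIX_FADV_SEQUENTIAL roughly doubles the readahead window and POSIX_FADV_RANDOM disables it. A minimal sketch (hypothetical file name):

    // readahead_hint.c - advise the kernel about the access pattern so
    // page_cache_sync_readahead can size its window accordingly.
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    
    int main(void)
    {
      char buf[4096];
      int fd = open("bigfile", O_RDONLY);
      if (fd < 0) { perror("open"); return 1; }
    
      /* we will scan the file front to back: enlarge the readahead window */
      posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
    
      while (read(fd, buf, sizeof(buf)) > 0)
        ;   /* cache misses trigger synchronous readahead of a larger window */
    
      close(fd);
      return 0;
    }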