read & write
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { struct fd f = fdget_pos(fd); ...... loff_t pos = file_pos_read(f.file); ret = vfs_read(f.file, buf, count, &pos); ...... } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { struct fd f = fdget_pos(fd); ...... loff_t pos = file_pos_read(f.file); ret = vfs_write(f.file, buf, count, &pos); ...... } // vfs_read->__vfs_read // vfs_write->__vfs_write ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { if (file->f_op->read) return file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) // 比如ext4_file_read_iter return new_sync_read(file, buf, count, pos); else return -EINVAL; } ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); else if (file->f_op->write_iter) // 比如ext4_file_write_iter return new_sync_write(file, p, count, pos); else return -EINVAL; } const struct file_operations ext4_file_operations = { ...... .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, ...... } // ext4_file_read_iter -> generic_file_read_iter // ext4_file_write_iter -> __generic_file_write_iter ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { ...... if (iocb->ki_flags & IOCB_DIRECT) { ...... struct address_space *mapping = file->f_mapping; ...... retval = mapping->a_ops->direct_IO(iocb, iter); } ...... retval = generic_file_buffered_read(iocb, iter, retval); } ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { ...... if (iocb->ki_flags & IOCB_DIRECT) { ...... written = generic_file_direct_write(iocb, from); ...... } else { ...... written = generic_perform_write(file, from, iocb->ki_pos); ...... } }
generic_file_read_iter 和 __generic_file_write_iter 有相似的逻辑,就是要区分是否用缓存。
如果发现设置了 IOCB_DIRECT,则会调用 address_space 的 direct_IO 的函数,直接从硬盘读写数据。
我们在 mmap 映射文件到内存的时候讲过 address_space,它主要用于在内存映射的时候将文件和内存页产生关联。
同样,对于缓存来讲,也需要文件和内存页进行关联,这就要用到 address_space。address_space 的相关操作定义在 struct address_space_operations 结构中。
对于 ext4 文件系统来讲, address_space 的操作定义在 ext4_aops,direct_IO 对应的函数是 ext4_direct_IO。
ext4_direct_IO 最终会调用到 __blockdev_direct_IO->do_blockdev_direct_IO,这就跨过了缓存层,经过通用块层,最终到达设备驱动层。
由于文件系统建立在块设备之上,直接 I/O 最终要落到块设备上,所以这里调用的是 blockdev 相关的函数。
static const struct address_space_operations ext4_aops = { ...... .direct_IO = ext4_direct_IO, ...... }; /* * This is a library function for use by filesystem drivers. */ static inline ssize_t do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) {......}
带缓存的写入操作
ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; do { struct page *page; unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ // 1. 对于每一页,先调用 address_space 的 write_begin 做一些准备 status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); // 2. 将写入的内容从用户态拷贝到内核态的页中 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); flush_dcache_page(page); // 3. 调用 address_space 的 write_end 完成写操作 status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); pos += copied; written += copied; // 4. 看脏页是否太多,需要写回硬盘。所谓脏页,就是写入到缓存,但是还没有写入到硬盘的页面。 balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); } static const struct address_space_operations ext4_aops = { ...... .write_begin = ext4_write_begin, .write_end = ext4_write_end, ...... } struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { struct page *page; int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; page = pagecache_get_page(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (page) wait_for_stable_page(page); return page; } struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ ...... }
第一步,对于 ext4 来讲,调用的是 ext4_write_begin。在 ext4_write_begin中,调用 ext4_journal_start 做日志相关的工作;调用 grab_cache_page_write_begin,来得到应该写入的缓存页,缓存页放在 radix 基数树里面。pagecache_get_page 就是根据 pgoff_t index 这个长整型,在这棵树里面查找缓存页,如果找不到就会创建一个缓存页。
第二步,调用 iov_iter_copy_from_user_atomic,将要写入的内容从用户态拷贝到内核态的缓存页中。第三步,调用 ext4_write_end 完成写入。这里面会调用 ext4_journal_stop 完成日志的写入,会调用 block_write_end->__block_commit_write->mark_buffer_dirty,将修改过的缓存标记为脏页。可以看出,其实所谓的完成写入,并没有真正写入硬盘,仅仅是写入缓存后,标记为脏页。
但是这里有一个问题,数据很危险,一旦宕机就没有了,所以需要一种机制,将写入的页面真正写到硬盘中,我们称为回写(Write Back)。
size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { // 1. 将分配好的页面映射到内核里面的一个虚拟地址 char *kaddr = kmap_atomic(page), *p = kaddr + offset; // 2. 将用户态的数据拷贝到内核态的页面的虚拟地址中 iterate_all_kinds(i, bytes, v, copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) ) // 3. 将内核里面的映射删除 kunmap_atomic(kaddr); return bytes; } /** * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; ...... if (unlikely(current->nr_dirtied >= ratelimit)) balance_dirty_pages(mapping, wb, current->nr_dirtied); ...... }
// 发现脏页的数目超过了规定的数目,就调用 balance_dirty_pages->wb_start_background_writeback,启动一个背后线程开始回写。 void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ wb_wakeup(wb); } static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_bh(&wb->work_lock); } (_tflags) | TIMER_IRQSAFE); } while (0) // bdi_wq 是一个全局变量,所有回写的任务都挂在这个队列上
// mod_delayed_work 函数负责将一个回写任务 bdi_writeback 挂在这个队列上
/* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq;
/** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * bdi_writeback 以 dwork 的身份挂到bdi_wq,并设delay=0,即一刻不等,马上执行。
* * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) {....} /* bdi 即 backing device info,用于描述后端存储相关的信息。
* 每个块设备都会有这样一个结构,并且在初始化块设备的时候,调用 bdi_init 初始化这个结构
* 在初始化 bdi 的时候,也会调用 wb_init 初始化 bdi_writeback。*/ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); // 初始化一个 timer定时器,到时间就执行 wb_workfn 这个函数。 INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->dirty_sleep = jiffies; ...... } #define __INIT_DELAYED_WORK(_work, _func, _tflags) do { INIT_WORK(&(_work)->work, (_func)); __setup_timer(&(_work)->timer, delayed_work_timer_fn, (unsigned long)(_work),
接下来的调用链为:
wb_workfn->wb_do_writeback->wb_writeback->writeback_sb_inodes->__writeback_single_inode->do_writepages,写入页面到硬盘。
带缓存的读操作
static ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; for (;;) { struct page *page; pgoff_t end_index; loff_t isize; // 1. 先在page cache 里面找是否有缓存页 page = find_get_page(mapping, index); if (!page) { if (iocb->ki_flags & IOCB_NOWAIT) goto would_block; // 2. 如果没有找到,不但读取这一页,还要进行预读 page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); // 3. 预读完了以后,再试一把查找缓存页,应该能找到了 page = find_get_page(mapping, index); if (unlikely(page == NULL)) goto no_cached_page; } // 4. 如果第一次找缓存页就找到了,我们还是要判断,是不是应该继续预读 if (PageReadahead(page)) { // 5. 如果需要,就发起一个异步预读 page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * 6. 将内容从内核缓存页拷贝到用户内存空间 */ ret = copy_page_to_iter(page, offset, nr, iter); } }