Ext3文件读写流程概述

zoukankan html css js c++ java

Ext3文件读写流程概述
Ext3文件读写流程概述

Ext3文件系统在进行读写操作的时候，首先需要open相应的文件，然后再进行读写操作。在open操作时，Linux kernel会创建一个file对象描述这个文件。File对象和文件的dentry和inode对象建立联系，并且将ext3的文件操作方法、映射处理方法（address space）注册到file对象中。

Ext3文件读写过程会涉及到VFS层的page cache，并且通常的读写操作都会使用到这层page cache，目的是提高磁盘的IO性能。Linux后台会运行writeback线程，定时将page cache中的脏数据同步到设备。Page cache的方式虽然能够提高IO性能，但是也对数据的安全性带来了潜在影响：尚未回写到设备的数据在系统掉电或崩溃时会丢失。

本文的目的是分析ext3文件系统读写流程中的关键函数，对于page cache原理以及writeback机制将在后续文章中做深入分析。

关键数据结构

file数据结构是Linux用来描述文件的关键数据结构，该对象在一个文件被进程打开的时候被创建。当文件被关闭、并且file对象的引用计数（f_count）降为0时，file对象才会被销毁。file数据结构不会被作为元数据信息持久化保存至设备。该数据结构定义如下：
1. struct file {
2. /*
3. * fu_list becomes invalid after file_free is called and queued via
4. * fu_rcuhead for RCU freeing
5. */
6. union {
7. struct list_head fu_list;
8. struct rcu_head fu_rcuhead;
9. } f_u;
10. struct path f_path; /* 文件路径，包含文件dentry目录项和vfsmount信息 */
11. #define f_dentry f_path.dentry
12. #define f_vfsmnt f_path.mnt
13. const struct file_operations *f_op; /* 文件操作函数集 */
15. /*
16. * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
17. * Must not be taken from IRQ context.
18. */
19. spinlock_t f_lock;
20. #ifdef CONFIG_SMP
21. int f_sb_list_cpu;
22. #endif
23. atomic_long_t f_count;
24. unsigned int f_flags;
25. fmode_t f_mode; /* 文件操作模式 */
26. loff_t f_pos;
27. struct fown_struct f_owner;
28. const struct cred *f_cred;
29. struct file_ra_state f_ra;
31. u64 f_version;
32. #ifdef CONFIG_SECURITY
33. void *f_security;
34. #endif
35. /* needed for tty driver, and maybe others */
36. void *private_data;
38. #ifdef CONFIG_EPOLL
39. /* Used by fs/eventpoll.c to link all the hooks to this file */
40. struct list_head f_ep_links;
41. struct list_head f_tfile_llink;
42. #endif /* #ifdef CONFIG_EPOLL */
43. struct address_space *f_mapping; /* address space映射信息，指向inode中的i_mapping */
44. #ifdef CONFIG_DEBUG_WRITECOUNT
45. unsigned long f_mnt_write_state;
46. #endif
47. };
每个文件在内存中都会对应一个inode对象。在设备上也会保存每个文件的inode元数据信息，通过inode元数据信息可以找到该文件所占用的所有文件数据块（block）。VFS定义了一个通用的inode数据结构，同时ext3定义了ext3_inode元数据结构。在创建内存inode对象时，需要采用ext3_inode元数据信息初始化inode对象。Inode数据结构定义如下：
1. struct inode {
2. umode_t i_mode;
3. unsigned short i_opflags;
4. uid_t i_uid;
5. gid_t i_gid;
6. unsigned int i_flags;
8. #ifdef CONFIG_FS_POSIX_ACL
9. struct posix_acl *i_acl;
10. struct posix_acl *i_default_acl;
11. #endif
13. const struct inode_operations *i_op; /* inode操作函数集 */
14. struct super_block *i_sb; /* 指向superblock */
15. struct address_space *i_mapping; /* 指向当前使用的页缓存的映射信息 */
17. #ifdef CONFIG_SECURITY
18. void *i_security;
19. #endif
21. /* Stat data, not accessed from path walking */
22. unsigned long i_ino;
23. /*
24. * Filesystems may only read i_nlink directly. They shall use the
25. * following functions for modification:
26. *
27. * (set|clear|inc|drop)_nlink
28. * inode_(inc|dec)_link_count
29. */
30. union {
31. const unsigned int i_nlink;
32. unsigned int __i_nlink;
33. };
34. dev_t i_rdev; /* 设备号，major&minor */
35. struct timespec i_atime;
36. struct timespec i_mtime;
37. struct timespec i_ctime;
38. spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
39. unsigned short i_bytes;
40. blkcnt_t i_blocks; /* 文件块数量 */
41. loff_t i_size;
43. #ifdef __NEED_I_SIZE_ORDERED
44. seqcount_t i_size_seqcount;
45. #endif
47. /* Misc */
48. unsigned long i_state;
49. struct mutex i_mutex;
51. unsigned long dirtied_when; /* jiffies of first dirtying */
53. struct hlist_node i_hash; /* 连接到inode Hash Table中 */
54. struct list_head i_wb_list; /* backing dev IO list */
55. struct list_head i_lru; /* inode LRU list */
56. struct list_head i_sb_list;
57. union {
58. struct list_head i_dentry;
59. struct rcu_head i_rcu;
60. };
61. atomic_t i_count;
62. unsigned int i_blkbits; /* 块大小的位数（以2为底的对数）：块大小为512字节时i_blkbits为9，ext3常见的4096字节块对应12 */
63. u64 i_version;
64. atomic_t i_dio_count;
65. atomic_t i_writecount;
66. const struct file_operations *i_fop; /* former ->i_op->default_file_ops，文件操作函数集 */
67. struct file_lock *i_flock;
68. struct address_space i_data; /* 页高速缓存映射信息 */
69. #ifdef CONFIG_QUOTA
70. struct dquot *i_dquot[MAXQUOTAS];
71. #endif
72. struct list_head i_devices;
73. union {
74. struct pipe_inode_info *i_pipe; /* 管道设备 */
75. struct block_device *i_bdev; /* block device块设备 */
76. struct cdev *i_cdev; /* 字符设备 */
77. };
79. __u32 i_generation;
81. #ifdef CONFIG_FSNOTIFY
82. __u32 i_fsnotify_mask; /* all events this inode cares about */
83. struct hlist_head i_fsnotify_marks;
84. #endif
86. #ifdef CONFIG_IMA
87. atomic_t i_readcount; /* struct files open RO */
88. #endif
89. void *i_private; /* fs or device private pointer */
90. };
读过程源码分析

Ext3文件系统读过程相对比较简单，函数调用关系如下图所示：

读过程可以分为两大类：Direct_io方式和page_cache方式。对于Direct_io方式，首先通过filemap_write_and_wait_range函数将page cache中对应范围的脏数据同步到设备（读路径只做回写并等待完成，不会使page cache失效），然后再通过ext3提供的direct_IO方法从设备读取数据。

另一种是直接从page cache中获取数据，通过do_generic_file_read函数实现该方式。该函数的主要流程说明如下：

1，通过读地址从page cache的radix树中获取相应的page页。

2，如果对应的page页不存在，那么需要创建一个page，然后再从设备读取相应的数据更新至page页。

3，当page页准备完毕之后，从页中拷贝数据至用户空间，page_cache方式的读操作完成。

写过程源码分析

Ext3的写过程主要分为direct_io写过程和page cache写过程两大类，整个写过程的函数调用关系如下图所示：

写操作的核心函数是__generic_file_aio_write，该函数实现如下：
1. ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2. unsigned long nr_segs, loff_t *ppos)
3. {
4. struct file *file = iocb->ki_filp;
5. /* 获取address space映射信息 */
6. struct address_space * mapping = file->f_mapping;
7. size_t ocount; /* original count */
8. size_t count; /* after file limit checks */
9. struct inode *inode = mapping->host; /* 获取文件inode索引节点 */
10. loff_t pos;
11. ssize_t written;
12. ssize_t err;
14. ocount = 0;
15. /* 检验数据区域是否存在问题，数据由iov数据结构管理 */
16. err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
17. if (err)
18. return err;
19. /* ocount为可以写入的数据长度 */
20. count = ocount;
21. pos = *ppos;
23. vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
25. /* We can write back this queue in page reclaim */
26. current->backing_dev_info = mapping->backing_dev_info;
27. written = 0;
28. /* 边界检查，需要判断写入数据是否超界、小文件边界检查以及设备是否是read-only。如果超界，那么降低写入数据长度 */
29. err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
30. if (err)
31. goto out;
32. /* count为实际可以写入的数据长度，如果写入数据长度为0，直接结束 */
33. if (count == 0)
34. goto out;
36. err = file_remove_suid(file);
37. if (err)
38. goto out;
40. file_update_time(file);
42. /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
43. if (unlikely(file->f_flags & O_DIRECT)) {
44. /* Direct IO操作模式，该模式会bypass Page Cache，直接将数据写入磁盘设备 */
45. loff_t endbyte;
46. ssize_t written_buffered;
47. /* 将对应page cache无效掉，然后将数据直接写入磁盘 */
48. written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
49. ppos, count, ocount);
50. if (written < 0 || written == count)
51. /* 所有数据已经写入磁盘，正确返回 */
52. goto out;
53. /*
54. * direct-io write to a hole: fall through to buffered I/O
55. * for completing the rest of the request.
56. */
57. pos += written;
58. count -= written;
59. /* 有些请求由于没有和块大小（通常为512字节）对齐，那么将无法正确完成direct-io操作。在__blockdev_direct_IO 函数中会检查逻辑地址是否和块大小对齐，__blockdev_direct_IO无法处理不对齐的请求。另外，在ext3逻辑地址和物理块地址映射操作函数ext3_get_block返回失败时，无法完成buffer_head的映射，那么request请求也将无法得到正确处理。所有没有得到处理的请求通过 buffer写的方式得到处理。从这点来看，direct_io并没有完全bypass page cache，在有些情况下是一种写无效模式。generic_file_buffered_write函数完成buffer写，将数据直接写入page cache */
60. written_buffered = generic_file_buffered_write(iocb, iov,
61. nr_segs, pos, ppos, count,
62. written);
63. /*
64. * If generic_file_buffered_write() retuned a synchronous error
65. * then we want to return the number of bytes which were
66. * direct-written, or the error code if that was zero. Note
67. * that this differs from normal direct-io semantics, which
68. * will return -EFOO even if some bytes were written.
69. */
70. if (written_buffered < 0) {
71. /* 如果page cache写失败，那么返回写成功的数据长度 */
72. err = written_buffered;
73. goto out;
74. }
76. /*
77. * We need to ensure that the page cache pages are written to
78. * disk and invalidated to preserve the expected O_DIRECT
79. * semantics.
80. */
81. endbyte = pos + written_buffered - written - 1;
82. /* 将page cache中的数据同步到磁盘 */
83. err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
84. if (err == 0) {
85. written = written_buffered;
86. /* 将page cache无效掉，保证下次读操作从磁盘获取数据 */
87. invalidate_mapping_pages(mapping,
88. pos >> PAGE_CACHE_SHIFT,
89. endbyte >> PAGE_CACHE_SHIFT);
90. } else {
91. /*
92. * We don't know how much we wrote, so just return
93. * the number of bytes which were direct-written
94. */
95. }
96. } else {
97. /* 将数据写入page cache。绝大多数的ext3写操作都会采用page cache写方式，通过后台writeback线程将page cache同步到硬盘 */
98. written = generic_file_buffered_write(iocb, iov, nr_segs,
99. pos, ppos, count, written);
100. }
101. out:
102. current->backing_dev_info = NULL;
103. return written ? written : err;
104. }
从__generic_file_aio_write函数可以看出，ext3写操作主要分为两大类：一类为direct_io；另一类为buffer_io （page cache write）。Direct IO可以bypass page cache，直接将数据写入设备。下面首先分析一下direct_io的处理流程。

如果操作地址对应的page页存在于page cache中，那么首先需要将这些page页中的数据同磁盘进行同步，然后将这些page缓存页无效掉，从而保证后续读操作能够从磁盘获取最新数据。在代码实现过程中，还需要考虑预读机制引入的page缓存页，所以在数据写入磁盘之后，需要再次查找page cache的radix树，保证写入的地址范围没有数据被缓存。

generic_file_direct_write是处理direct_io的主要函数，该函数的实现如下：
1. ssize_t
2. generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
3. unsigned long *nr_segs, loff_t pos, loff_t *ppos,
4. size_t count, size_t ocount)
5. {
6. struct file *file = iocb->ki_filp;
7. struct address_space *mapping = file->f_mapping;
8. struct inode *inode = mapping->host;
9. ssize_t written;
10. size_t write_len;
11. pgoff_t end;
13. if (count != ocount)
14. *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
16. write_len = iov_length(iov, *nr_segs);
17. end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
18. /* 将对应区域page cache中的新数据页刷新到设备，这个操作是同步的 */
19. written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
20. if (written)
21. goto out;
23. /*
24. * After a write we want buffered reads to be sure to go to disk to get
25. * the new data. We invalidate clean cached page from the region we're
26. * about to write. We do this *before* the write so that we can return
27. * without clobbering -EIOCBQUEUED from ->direct_IO().
28. */
29. /* 将page cache对应page 缓存无效掉，这样可以保证后继的读操作能从磁盘获取最新数据 */
30. if (mapping->nrpages) {
31. /* 无效对应的page缓存 */
32. written = invalidate_inode_pages2_range(mapping,
33. pos >> PAGE_CACHE_SHIFT, end);
34. /*
35. * If a page can not be invalidated, return 0 to fall back
36. * to buffered write.
37. */
38. if (written) {
39. if (written == -EBUSY)
40. return 0;
41. goto out;
42. }
43. }
44. /* 调用ext3文件系统的direct io方法，将数据写入磁盘 */
45. written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
47. /*
48. * Finally, try again to invalidate clean pages which might have been
49. * cached by non-direct readahead, or faulted in by get_user_pages()
50. * if the source of the write was an mmap'ed region of the file
51. * we're writing. Either one is a pretty crazy thing to do,
52. * so we don't support it 100%. If this invalidation
53. * fails, tough, the write still worked...
54. */
55. /* 再次无效掉由于预读操作导致的对应地址的page cache缓存页 */
56. if (mapping->nrpages) {
57. invalidate_inode_pages2_range(mapping,
58. pos >> PAGE_CACHE_SHIFT, end);
59. }
61. if (written > 0) {
62. pos += written;
63. if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
64. i_size_write(inode, pos);
65. mark_inode_dirty(inode);
66. }
67. *ppos = pos;
68. }
69. out:
70. return written;
71. }
generic_file_direct_write函数中刷新page cache的函数调用关系描述如下：

filemap_write_and_wait_range → __filemap_fdatawrite_range → do_writepages

do_writepages函数的作用是将page页中的数据同步到设备，该函数实现如下：
1. int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
2. {
3. int ret;
5. if (wbc->nr_to_write <= 0)
6. return 0;
7. if (mapping->a_ops->writepages)
8. /* 如果文件系统定义了writepages方法，调用该方法刷新page cache页 */
9. ret = mapping->a_ops->writepages(mapping, wbc);
10. else
11. /* ext3没有定义writepages方法，因此调用generic_writepages()函数将page cache中的脏页刷新到磁盘 */
12. ret = generic_writepages(mapping, wbc);
13. return ret;
14. }
从上述分析可以看出，direct_io需要块大小对齐，否则还会调用page cache的路径。为了提高I/O性能，通常情况下ext3都会采用page cache异步写的方式。这也就是ext3的第二种写操作方式，该方式实现的关键函数是generic_file_buffered_write，其实现如下：
1. ssize_t
2. generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
3. unsigned long nr_segs, loff_t pos, loff_t *ppos,
4. size_t count, ssize_t written)
5. {
6. struct file *file = iocb->ki_filp;
7. ssize_t status;
8. struct iov_iter i;
10. iov_iter_init(&i, iov, nr_segs, count, written);
11. /* 执行page cache写操作 */
12. status = generic_perform_write(file, &i, pos);
14. if (likely(status >= 0)) {
15. written += status;
16. *ppos = pos + status;
17. }
19. return written ? written : status;
20. }
generic_file_buffered_write其实是对generic_perform_write函数的封装，generic_perform_write实现了page cache写的所有流程，该函数实现如下：
1. static ssize_t generic_perform_write(struct file *file,
2. struct iov_iter *i, loff_t pos)
3. {
4. struct address_space *mapping = file->f_mapping;
5. const struct address_space_operations *a_ops = mapping->a_ops; /* 映射处理函数集 */
6. long status = 0;
7. ssize_t written = 0;
8. unsigned int flags = 0;
10. /*
11. * Copies from kernel address space cannot fail (NFSD is a big user).
12. */
13. if (segment_eq(get_fs(), KERNEL_DS))
14. flags |= AOP_FLAG_UNINTERRUPTIBLE;
16. do {
17. struct page *page;
18. unsigned long offset; /* Offset into pagecache page */
19. unsigned long bytes; /* Bytes to write to page */
20. size_t copied; /* Bytes copied from user */
21. void *fsdata;
23. offset = (pos & (PAGE_CACHE_SIZE - 1));
24. bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
25. iov_iter_count(i));
27. again:
28. /*
29. * Bring in the user page that we will copy from _first_.
30. * Otherwise there's a nasty deadlock on copying from the
31. * same page as we're writing to, without it being marked
32. * up-to-date.
33. *
34. * Not only is this an optimisation, but it is also required
35. * to check that the address is actually valid, when atomic
36. * usercopies are used, below.
37. */
38. if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
39. status = -EFAULT;
40. break;
41. }
42. /* 调用ext3中的write_begin函数（inode.c中）ext3_write_begin，如果写入的page页不存在，那么ext3_write_begin会创建一个Page页，然后从硬盘中读入相应的数据 */
43. status = a_ops->write_begin(file, mapping, pos, bytes, flags,
44. &page, &fsdata);
45. if (unlikely(status))
46. break;
48. if (mapping_writably_mapped(mapping))
49. flush_dcache_page(page);
51. pagefault_disable();
52. /* 将数据拷贝到page cache中 */
53. copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
54. pagefault_enable();
55. flush_dcache_page(page);
57. mark_page_accessed(page);
58. /* 调用ext3的write_end函数（inode.c中），写完数据之后会将page页标识为dirty，后台writeback线程会将dirty page刷新到设备 */
59. status = a_ops->write_end(file, mapping, pos, bytes, copied,
60. page, fsdata);
61. if (unlikely(status < 0))
62. break;
63. copied = status;
65. cond_resched();
67. iov_iter_advance(i, copied);
68. if (unlikely(copied == 0)) {
69. /*
70. * If we were unable to copy any data at all, we must
71. * fall back to a single segment length write.
72. *
73. * If we didn't fallback here, we could livelock
74. * because not all segments in the iov can be copied at
75. * once without a pagefault.
76. */
77. bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
78. iov_iter_single_seg_count(i));
79. goto again;
80. }
81. pos += copied;
82. written += copied;
84. balance_dirty_pages_ratelimited(mapping);
85. if (fatal_signal_pending(current)) {
86. status = -EINTR;
87. break;
88. }
89. } while (iov_iter_count(i));
91. return written ? written : status;
92. }
本文出自 “存储之道” 博客，请务必保留此出处http://alanwu.blog.51cto.com/3652632/1106506
查看全文

相关阅读:
.NET文件格式相关开源项目
 ASP.NET配置文件Web.config 详细解释
 Allow user to select camera or gallery for image
android之调用webservice 实现图片上传
 使用 iTextSharp 生成 PDF 表格
 Android：如何显示网络图片
 Android的HttpClient和WebView session不同的问题
 提高你开发效率的十五个Visual Studio 2010使用技巧
 HTML5 Audio/Video 标签,属性,方法,事件汇总
 ASP.NET Web 项目文件类型

原文地址：https://www.cnblogs.com/CosyAndStone/p/3261114.html