vfs open系统调用flow之具体文件系统lookup(ext4 fs lookup)
ext4_lookup
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
ext4_lookup参数列表说明如下:
dir:当前目录@dentry的父目录
dentry:需要查找的当前目录
ext4_lookup()首先调用了ext4_lookup_entry,这个函数根据当前路径的dentry的d_name成员在当前目录的父目录文件(用inode表示)里查找,这个会open父目录文件会涉及到io读操作。查找到后,得到当前目录的ext4_dir_entry_2,此结构体里有当前目录的inode number,然后根据此inode number调用ext4_iget,获得这个inode number对应的inode struct,得到这个inode后调用d_splice_alias()将dentry和inode绑定,即将inode赋值给dentry的d_inode成员。
ext4_lookup_entry
ext4_lookup_entry里的ext4_fname_prepare_lookup根据dentry的d_name成员设置fname,__ext4_find_entry将根据这个fname进行查找
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct dentry *dentry,
struct ext4_dir_entry_2 **res_dir)
{
int err;
struct ext4_filename fname;
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
if (err == -ENOENT)
return NULL;
if (err)
return ERR_PTR(err);
bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
ext4_fname_free_filename(&fname);
return bh;
}
static struct buffer_head *__ext4_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
struct buffer_head *bh, *ret = NULL;
ext4_lblk_t start, block;
const u8 *name = fname->usr_fname->name;
size_t ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
size_t ra_ptr = 0; /* Current index into readahead
buffer */
ext4_lblk_t nblocks;
int i, namelen, retval;
*res_dir = NULL;
sb = dir->i_sb;
namelen = fname->usr_fname->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
ret = ext4_find_inline_entry(dir, fname, res_dir,
&has_inline_data);
if (has_inline_data) {
if (inlined)
*inlined = 1;
goto cleanup_and_exit;
}
}
if ((namelen <= 2) && (name[0] == '.') &&
(name[1] == '.' || name[1] == '\0')) {
/*
* "." or ".." will only be in the first block
* NFS may look up ".."; "." should be handled by the VFS
*/
block = start = 0;
nblocks = 1;
goto restart;
}
if (is_dx(dir)) {
ret = ext4_dx_find_entry(dir, fname, res_dir);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); //dir->i_size表示dir目录文件的size,所以nblocks表示目录文件size的block num
if (!nblocks) {
ret = NULL;
goto cleanup_and_exit;
}
start = EXT4_I(dir)->i_dir_start_lookup; //如果之前有读过此dir目录文件,i_dir_start_lookup是上次读这个目录文件找到了目标目录时的block idx;如果dir目录文件之前没有被open过,则i_dir_start_lookup为0
if (start >= nblocks)
start = 0;
block = start;
restart:
do {
/*
* We deal with the read-ahead logic here.
*/
cond_resched();
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
if (block < start)
ra_max = start - block;
else
ra_max = nblocks - block;
ra_max = min(ra_max, ARRAY_SIZE(bh_use));
retval = ext4_bread_batch(dir, block, ra_max, //这里进行io read操作,一次读ra_max block,ra表示read ahead(预读)
false /* wait */, bh_use);
if (retval) {
ret = ERR_PTR(retval);
ra_max = 0;
goto cleanup_and_exit;
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
(unsigned long) block);
brelse(bh);
ret = ERR_PTR(-EIO);
goto cleanup_and_exit;
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_INODE(dir, "checksumming directory "
"block %lu", (unsigned long)block);
brelse(bh);
ret = ERR_PTR(-EFSBADCRC);
goto cleanup_and_exit;
}
set_buffer_verified(bh);
i = search_dirblock(bh, dir, fname, //在读到的block buffer里查找,一次查找一个block,等查找了ra_max个block后仍然没有找到当前目录(fname),再调用上面的ext4_bread_batch()再读ra_max block。
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) { //条件满足,表示查找到目标目录/文件
EXT4_I(dir)->i_dir_start_lookup = block; //记录当前查找到的当前目录所在的block的index
ret = bh;
goto cleanup_and_exit;
} else {
brelse(bh);
if (i < 0)
goto cleanup_and_exit;
}
next:
if (++block >= nblocks) //条件成立表示查找到dir目录文件末尾了,因为block可能不是从dir目录文件开头处开始的,即可能不是block 0开始的,所以将block设置为0从目录文件头开始查找,直到block等于start将停止这个while(这个条件即下面while(block != start))
block = 0;
} while (block != start);
/*
* If the directory has grown while we were searching, then
* search the last part of the directory before giving up.
*/
block = nblocks;
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
if (block < nblocks) {
start = 0;
goto restart;
}
cleanup_and_exit:
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
return ret;
}
ext4_search_dir()参数说明:
bh: dir目录文件内容的buffer head
search_buf: 即bh的data buffer;
buf_size:文件系统super block结构体里的block size成员;
fname: 要查找的目录的name
dir目录文件内容由ext4_dir_entry_2这样的结构体组成,其结构示意图如下,所以ext4_search_dir根据目标目录name fname在bh里查找即可,查找到后将匹配的ext4_dir_entry_2赋值给res_dir。
(上述图片123、dir_abc是目录文件里子目录/文件的名字,rec_len表示用ext4_dir_entry_2来描述的子目录/文件项的大小)
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct inode *dir, struct ext4_filename *fname,
unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
if ((char *) de + de->name_len <= dlimit &&
ext4_match(fname, de)) {
/* found a match - just to be sure, do
* a full check */
if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
buf_size, offset))
return -1;
*res_dir = de;
return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
if (de_len <= 0)
return -1;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
return 0;
}
__ext4_iget
__ext4_iget()参数描述如下:
sb: 当前目录/文件的父目录inode struct里的i_sb成员,即super_block成员;
ino: 当前目录/文件的inode number
__ext4_iget首先调用iget_locked(),此函数先在inode inode_hashtable(icache)上查找,查找匹配的原则是inode num相等并且super_block一样。如果在icache上没有找到,则会分配一个indoe struct,分配后会将这个inode struct插入inode hash链表上。
然后判断iget_locked反馈的inode struct的i_state是否有I_NEW flag,如果没有,说明是从icache上查找到的,直接返回这个inode struct,__ext4_iget结束;如果有这个flag则说明是刚alloc的,则需要设置这个inode struct,其中一个很重要的是设置其i_fop成员,设置为ext4_file_operations(对于是文件case而非目录文件case时)。此inode->i_fop将会赋值给file struct的f_op成员,open之后,read/write等系统调用将使用file struct里的f_op file_operations函数集来read、write:
const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, };
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { ext4_error_inode(inode, function, line, 0, "iget: root inode unallocated"); ret = -EFSCORRUPTED; goto bad_inode; } if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (ext4_encrypted_inode(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); }
struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. */ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; }
static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { struct inode *inode = NULL; repeat: hlist_for_each_entry(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); return inode; } return NULL; }