zoukankan      html  css  js  c++  java
  • 写文件的流程

    很多文件系统都是通过generic_file_write()函数来实现文件对象的write方法,即write(库函数)->sys_write()->generic_file_write():

    /*
     * generic_file_write - write a single user buffer to a file.
     * @file:  file being written
     * @buf:   user-space source buffer
     * @count: number of bytes requested
     * @ppos:  in/out file position
     *
     * Wraps the user buffer in a one-element iovec and calls
     * __generic_file_write_nolock() with inode->i_sem held.  When bytes
     * were written and either the file has O_SYNC set or IS_SYNC(inode)
     * is true, the range just written is passed to sync_page_range().
     *
     * Returns the result of the write, or the negative value returned by
     * sync_page_range() if that step fails.
     */
    ssize_t generic_file_write(struct file *file, const char __user *buf,
    			   size_t count, loff_t *ppos)
    {
    	struct address_space *mapping = file->f_mapping;
    	struct inode *inode = mapping->host;
    	struct iovec local_iov;
    	ssize_t nwritten;
    	ssize_t sync_err;

    	local_iov.iov_base = (void __user *)buf;
    	local_iov.iov_len = count;

    	/* The actual write runs under the inode semaphore. */
    	down(&inode->i_sem);
    	nwritten = __generic_file_write_nolock(file, &local_iov, 1, ppos);
    	up(&inode->i_sem);

    	/* Nothing written (or an error): no sync step. */
    	if (nwritten <= 0)
    		return nwritten;

    	/* Only synchronous files/inodes need the extra sync step. */
    	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
    		return nwritten;

    	/* The range starts nwritten bytes before the current *ppos. */
    	sync_err = sync_page_range(inode, mapping, *ppos - nwritten, nwritten);
    	if (sync_err < 0)
    		return sync_err;

    	return nwritten;
    }

    generic_file_write()会调用__generic_file_write_nolock(),即write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock():

    /*
     * __generic_file_write_nolock - synchronous front end to the aio write path.
     * @file:    file being written
     * @iov:     array of user buffers
     * @nr_segs: number of entries in @iov
     * @ppos:    in/out file position
     *
     * Initialises an on-stack kiocb for @file and passes it to
     * __generic_file_aio_write_nolock().  If that call returns
     * -EIOCBQUEUED, the final result is obtained from wait_on_sync_kiocb().
     *
     * This function takes no lock itself.
     */
    ssize_t
    __generic_file_write_nolock(struct file *file, const struct iovec *iov,
    				unsigned long nr_segs, loff_t *ppos)
    {
    	struct kiocb sync_req;
    	ssize_t result;

    	init_sync_kiocb(&sync_req, file);
    	result = __generic_file_aio_write_nolock(&sync_req, iov, nr_segs, ppos);

    	/* Anything other than "queued" is already the final answer. */
    	if (result != -EIOCBQUEUED)
    		return result;

    	return wait_on_sync_kiocb(&sync_req);
    }

    write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock()->__generic_file_aio_write_nolock():

    /*
     * __generic_file_aio_write_nolock - validate a vectored write and dispatch it.
     * @iocb:    I/O control block; the target file is taken from iocb->ki_filp
     * @iov:     array of user buffers
     * @nr_segs: number of entries in @iov
     * @ppos:    in/out file position
     *
     * Sums and validates the iovec segments, runs generic_write_checks(),
     * then writes either through generic_file_direct_write() (O_DIRECT) or
     * through generic_file_buffered_write().  A direct write that did not
     * cover the whole request falls through to the buffered path for the
     * remainder.
     *
     * Returns 'written' when it is non-zero (byte count or negative error
     * from the write helpers), otherwise 'err'.
     *
     * No lock is taken in this function.
     */
    ssize_t
    __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
    				unsigned long nr_segs, loff_t *ppos)
    {
    	struct file *file = iocb->ki_filp;
    	struct address_space * mapping = file->f_mapping;
    	size_t ocount;		/* original count */
    	size_t count;		/* after file limit checks */
    	struct inode 	*inode = mapping->host;
    	unsigned long	seg;
    	loff_t		pos;
    	ssize_t		written;
    	ssize_t		err;
    
    	/* Pass 1: total up the segment lengths and check each user range. */
    	ocount = 0;
    	for (seg = 0; seg < nr_segs; seg++) {
    		const struct iovec *iv = &iov[seg];
    
    		/*
    		 * If any segment has a negative length, or the cumulative
    		 * length ever wraps negative then return -EINVAL.
    		 */
    		ocount += iv->iov_len;
    		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
    			return -EINVAL;
    		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
    			continue;
    		/* A bad first segment fails the whole call. */
    		if (seg == 0)
    			return -EFAULT;
    		/* Otherwise keep only the segments that passed the check. */
    		nr_segs = seg;
    		ocount -= iv->iov_len;	/* This segment is no good */
    		break;
    	}
    
    	count = ocount;
    	pos = *ppos;
    
    	/* We can write back this queue in page reclaim */
    	current->backing_dev_info = mapping->backing_dev_info;
    	written = 0;
    
    	/* May adjust pos and count; a non-zero result aborts the write. */
    	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
    	if (err)
    		goto out;
    
    	/* Nothing left to write: returns 0, since written and err are both 0. */
    	if (count == 0)
    		goto out;
    
    	/* NOTE(review): presumably clears setuid/setgid bits - confirm. */
    	err = remove_suid(file->f_dentry);
    	if (err)
    		goto out;
    
    	/* NOTE(review): presumably updates the inode timestamps - confirm. */
    	inode_update_time(inode, 1);
    
    	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    	if (unlikely(file->f_flags & O_DIRECT)) {
    		written = generic_file_direct_write(iocb, iov,
    				&nr_segs, pos, ppos, count, ocount);
    		/* Done on error or when the whole request was written. */
    		if (written < 0 || written == count)
    			goto out;
    		/*
    		 * direct-io write to a hole: fall through to buffered I/O
    		 * for completing the rest of the request.
    		 */
    		pos += written;
    		count -= written;
    	}
    
    	/* 'written' carries over any bytes already written directly. */
    	written = generic_file_buffered_write(iocb, iov, nr_segs,
    			pos, ppos, count, written);
    out:
    	/* Undo the backing_dev_info assignment made above. */
    	current->backing_dev_info = NULL;
    	return written ? written : err;
    }

    write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock()->__generic_file_aio_write_nolock()->generic_file_buffered_write():

    /*
     * generic_file_buffered_write - copy user data into the page cache.
     * @iocb:    I/O control block; the target file is taken from iocb->ki_filp
     * @iov:     array of user buffers
     * @nr_segs: number of entries in @iov
     * @pos:     file offset at which to start writing
     * @ppos:    out: receives the file position after the loop
     * @count:   number of bytes still to write
     * @written: bytes already written by an earlier (direct I/O) attempt
     *
     * For each page touched by the request: fault in the user source, get
     * the locked target page via __grab_cache_page(), call the mapping's
     * prepare_write, copy the user data into the page, then call
     * commit_write.  The loop ends when @count reaches zero or on the first
     * error.
     *
     * Returns the total number of bytes written when that is non-zero,
     * otherwise the last status code.
     */
    ssize_t
    generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
    		unsigned long nr_segs, loff_t pos, loff_t *ppos,
    		size_t count, ssize_t written)
    {
    	struct file *file = iocb->ki_filp;
    	struct address_space * mapping = file->f_mapping;
    	struct address_space_operations *a_ops = mapping->a_ops;
    	struct inode 	*inode = mapping->host;
    	long		status = 0;
    	struct page	*page;
    	struct page	*cached_page = NULL;
    	size_t		bytes;
    	struct pagevec	lru_pvec;
    	const struct iovec *cur_iov = iov; /* current iovec */
    	size_t		iov_base = 0;	   /* offset in the current iovec */
    	char __user	*buf;

    	pagevec_init(&lru_pvec, 0);

    	buf = iov->iov_base + written;	/* handle partial DIO write */
    	do {
    		unsigned long index;
    		unsigned long offset;
    		size_t copied;

    		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
    		/*
    		 * Index of the page-cache page to write.  (For how a page
    		 * descriptor is looked up in the radix tree by page index,
    		 * see ULK p. 600.)
    		 */
    		index = pos >> PAGE_CACHE_SHIFT;
    		bytes = PAGE_CACHE_SIZE - offset;
    		/* Last chunk: do not go past the bytes that remain. */
    		if (bytes > count)
    			bytes = count;

    		/*
    		 * Bring in the user page that we will copy from _first_.
    		 * Otherwise there's a nasty deadlock on copying from the
    		 * same page as we're writing to, without it being marked
    		 * up-to-date.
    		 */
    		fault_in_pages_readable(buf, bytes);
    		/*
    		 * Look up the page to be written in the radix tree; if it
    		 * does not exist yet, one is created (__grab_cache_page()).
    		 */
    		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
    		if (!page) {
    			status = -ENOMEM;
    			break;
    		}
    		/*
    		 * Per the accompanying article: prepares the buffer_head
    		 * structures describing the blocks that make up this page.
    		 * The actual work is filesystem-specific.
    		 */
    		status = a_ops->prepare_write(file, page, offset, offset+bytes);
    		if (unlikely(status)) {
    			loff_t isize = i_size_read(inode);
    			/*
    			 * prepare_write() may have instantiated a few blocks
    			 * outside i_size.  Trim these off again.
    			 */
    			unlock_page(page);
    			page_cache_release(page);
    			if (pos + bytes > isize)
    				vmtruncate(inode, isize);
    			break;
    		}
    		/*
    		 * Copy the user data into the page.  The single-segment case
    		 * copies straight from buf; the multi-segment case walks the
    		 * iovec array starting at cur_iov/iov_base.
    		 */
    		if (likely(nr_segs == 1))
    			copied = filemap_copy_from_user(page, offset,
    							buf, bytes);
    		else
    			copied = filemap_copy_from_user_iovec(page, offset,
    						cur_iov, iov_base, bytes);
    		flush_dcache_page(page);
    		/*
    		 * Per the accompanying article: marks the underlying buffers
    		 * dirty so that they are written to disk later.
    		 */
    		status = a_ops->commit_write(file, page, offset, offset+bytes);
    		if (likely(copied > 0)) {
    			/* A zero status from commit_write means "all copied". */
    			if (!status)
    				status = copied;

    			if (status >= 0) {
    				written += status;
    				count -= status;
    				pos += status;
    				buf += status;
    				if (unlikely(nr_segs > 1))
    					filemap_set_next_iovec(&cur_iov,
    							&iov_base, status);
    			}
    		}
    		/* A short copy turns into -EFAULT unless an error is already set. */
    		if (unlikely(copied != bytes))
    			if (status >= 0)
    				status = -EFAULT;
    		unlock_page(page);
    		mark_page_accessed(page);
    		page_cache_release(page);
    		if (status < 0)
    			break;
    		balance_dirty_pages_ratelimited(mapping);
    		cond_resched();
    	} while (count);
    	*ppos = pos;

    	/* Drop the spare page if __grab_cache_page() left one unused. */
    	if (cached_page)
    		page_cache_release(cached_page);

    	/*
    	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
    	 */
    	if (likely(status >= 0)) {
    		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
    			if (!a_ops->writepage || !is_sync_kiocb(iocb))
    				status = generic_osync_inode(inode, mapping,
    						OSYNC_METADATA|OSYNC_DATA);
    		}
    	}

    	/*
    	 * If we get here for O_DIRECT writes then we must have fallen through
    	 * to buffered writes (block instantiation inside i_size).  So we sync
    	 * the file data here, to try to honour O_DIRECT expectations.
    	 */
    	if (unlikely(file->f_flags & O_DIRECT) && written)
    		status = filemap_write_and_wait(mapping);

    	pagevec_lru_add(&lru_pvec);
    	return written ? written : status;
    }
    
    /*
     * __grab_cache_page - find the page at @index in @mapping, or insert one.
     * @mapping:     address space to search
     * @index:       page index within @mapping
     * @cached_page: in/out spare page; allocated here when empty, and
     *               cleared once the spare has been inserted into the cache
     * @lru_pvec:    pagevec that collects newly inserted pages
     *
     * Returns the page found by find_lock_page(), or the newly inserted
     * page.  Returns NULL if allocation fails or add_to_page_cache() fails
     * with anything other than -EEXIST.
     */
    static inline struct page *
    __grab_cache_page(struct address_space *mapping, unsigned long index,
    			struct page **cached_page, struct pagevec *lru_pvec)
    {
    	struct page *found;
    	int rc;

    	for (;;) {
    		/*
    		 * Get the page descriptor from the address_space and the
    		 * page index (ULK p. 602).
    		 */
    		found = find_lock_page(mapping, index);
    		if (found)
    			return found;

    		/* Not cached: make sure a spare page is available. */
    		if (*cached_page == NULL) {
    			*cached_page = page_cache_alloc(mapping);
    			if (*cached_page == NULL)
    				return NULL;
    		}

    		/*
    		 * Insert the new page descriptor into the page cache, i.e.
    		 * add a new node to the radix tree.
    		 */
    		rc = add_to_page_cache(*cached_page, mapping,
    					index, GFP_KERNEL);

    		/* -EEXIST means a page already exists at @index: look it up again. */
    		if (rc != -EEXIST)
    			break;
    	}

    	/* Any other failure: the spare stays in *cached_page for the caller. */
    	if (rc != 0)
    		return NULL;

    	found = *cached_page;
    	page_cache_get(found);
    	/* Hand the pagevec over to the LRU once pagevec_add() reports it full. */
    	if (!pagevec_add(lru_pvec, found))
    		__pagevec_lru_add(lru_pvec);
    	*cached_page = NULL;

    	return found;
    }
    

    //prepare_write分析
    address_space对象的prepare_write和commit_write方法专用于由generic_file_write()实现的通用写操作,这个函数适用于普通文件和块设备文件。每个磁盘文件系统都定义了自己的prepare_write方法。与读操作类似,这个方法只是对通用函数的封装。例如,Ext2文件系统通过下列函数实现prepare_write方法:
    //其中被调用的block_prepare_write()函数定义在fs/buffer.c文件中
    /*
     * ext2_prepare_write - Ext2 implementation of the prepare_write method.
     * @file: file being written (not used here)
     * @page: page about to be written
     * @from: start offset, passed through to block_prepare_write()
     * @to:   end offset, passed through to block_prepare_write()
     *
     * Thin wrapper: forwards to block_prepare_write() with ext2_get_block
     * as the callback and returns its result unchanged.
     */
    int ext2_prepare_write(struct file *file, struct page *page,
    		       unsigned from, unsigned to)
    {
    	int status;

    	status = block_prepare_write(page, from, to, ext2_get_block);
    	return status;
    }
    一旦prepare_write方法返回,generic_file_write()函数就用存放在用户态地址空间中的数据更新高速缓存页面。接下来,调用address_space对象的commit_write方法。这个方法由generic_commit_write()函数实现。generic_commit_write()函数执行如下步骤:
    1.调用__block_commit_write()函数,该函数依次执行如下步骤:
    A.考虑页中受写操作影响的所有缓冲区;对于其中的每个缓冲区,将相应缓冲区首部的BH_Uptodate和BH_Dirty标志置位。
    B.将相应的索引节点标记为脏,这可能需要把索引节点加入超级块的脏索引节点链表。
    C.如果缓冲区页中的所有缓冲区都是最新的,则将页的PG_uptodate标志置位。
    D.将页的PG_dirty标志置位,并在基树中将页标记成脏。
    2.检查写操作是否使文件增大;如果是,则更新文件索引节点的i_size字段。

  • 相关阅读:
    Asp.MVC 各个版本比较(资源整合)与WebForm的区别
    20款最新且极具创意的jQuery插件(附下载)
    php java net 开发比较
    移动互联网渠道乱象
    基于微软IIS/.NET平台开发的知名网站 (补充)
    sqlsql语句查询优化总结,建议及写法技巧(汇总)
    总结关于对日外包的一些想法
    .net跨平台解决方案mono真正实现C#代码一次编写处处运行(微软已经正式支持夸平台框架aspnet core)
    .NET 常用经典学习资源网站推荐
    常用visual studio 插件工具
  • 原文地址:https://www.cnblogs.com/lcchuguo/p/4078422.html
Copyright © 2011-2022 走看看