【转】Linux Writeback机制分析

zoukankan html css js c++ java

【转】Linux Writeback机制分析
1. bdi是什么?

    bdi，即backing device info的缩写，顾名思义，它描述备用存储设备的相关信息，这在内核代码里用一个结构体backing_dev_info来表示。

    bdi，备用存储设备，简单点说就是能够用来存储数据的设备，而这些设备存储的数据能够保证在计算机电源关闭时也不丢失。这样说来，软盘存储设备、光驱存储设备、USB存储设备、硬盘存储设备都是所谓的备用存储设备（后面都用bdi来指示），而内存显然不是。

2. bdi工作模型

相对于内存来说，bdi设备（比如最常见的硬盘存储设备）的读写速度是非常慢的，因此为了提高系统整体性能，Linux系统对bdi设备的读写内容进行了缓冲，那些读写的数据会临时保存在内存里，以避免每次都直接操作bdi设备。但这就需要在一定的时机（比如每隔5秒、脏数据达到一定的比率等）把它们同步到bdi设备，否则长久地待在内存里容易丢失（比如机器突然宕机、重启）。进行这种间隔性同步工作的进程之前名叫pdflush，但后来在Kernel 2.6.2x/3x对此进行了优化改进，产生了多个内核进程：bdi-default、flush-x:y等。

   关于以前的pdflush不再多说，我们这里只讨论bdi-default和flush-x:y，这两个进程（事实上，flush-x:y为多个）的关系为父与子的关系，即bdi-default根据当前的状态Create或Destroy flush-x:y，x为块设备类型，y为此类设备的序号。如有两个TF卡，则分别为：flush-179:0、flush-179:1。

    一般而言，一个Linux系统会挂载很多bdi设备。在bdi设备注册（函数：bdi_register(…)）时，这些bdi设备会以链表的形式组织在全局变量bdi_list下。其中有一个比较特别的bdi设备，即default bdi设备（default_backing_dev_info）：它除了被加入bdi_list之外，还会新建一个bdi-default内核进程，即本文的主角。具体代码如下，我相信你一眼就能注意到kthread_run和list_add_tail_rcu这样的关键代码。
[cpp] view plain copy

/*
 * The default backing_dev_info instance, named "default".
 *
 * bdi_cap_flush_forker() compares a bdi against the address of this
 * object, and bdi_register() starts the forker thread only for it.
 */
struct backing_dev_info default_backing_dev_info = {

    .name       = "default",

    /* NOTE(review): presumably a readahead window converted to pages (VM_MAX_READAHEAD looks like KiB) - confirm */
    .ra_pages   = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,

    .state      = 0,

    .capabilities   = BDI_CAP_MAP_COPY,

};

/* Make the object available to modules through the GPL export macro. */
EXPORT_SYMBOL_GPL(default_backing_dev_info);
[cpp] view plain copy

/*
 * Report whether @bdi is the global default_backing_dev_info.
 *
 * Returns true only when @bdi points at that object, false otherwise.
 */
static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
{
    const bool is_default_bdi = (&default_backing_dev_info == bdi);

    return is_default_bdi;
}



/*
 * bdi_register - create the device for @bdi and add it to bdi_list
 * @bdi:    backing device to register
 * @parent: parent device handed to device_create_vargs()
 * @fmt:    printf-style format (with trailing varargs) for the device name
 *
 * If @bdi already has a device, nothing is done and 0 is returned.
 * For the default bdi (see bdi_cap_flush_forker()) the forker thread is
 * started here as well.
 *
 * Returns 0 on success, otherwise the error from device creation or from
 * starting the forker thread.
 */
int bdi_register(struct backing_dev_info *bdi, struct device *parent,

        const char *fmt, ...)

{

    va_list args;

    struct device *dev;



    if (bdi->dev)    /* The driver needs to use separate queues per device */

        return 0;



    va_start(args, fmt);

    dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);

    va_end(args);

    if (IS_ERR(dev))

        return PTR_ERR(dev);



    bdi->dev = dev;



    /*

     * Just start the forker thread for our default backing_dev_info,

     * and add other bdi's to the list. They will get a thread created

     * on-demand when they need it.

     */

    if (bdi_cap_flush_forker(bdi)) {

        struct bdi_writeback *wb = &bdi->wb;



        wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",

                        dev_name(dev));

        /*
         * NOTE(review): on this failure path bdi->dev stays set and
         * wb->task keeps the error pointer; nothing here undoes the
         * device creation. Confirm callers cope with that.
         */
        if (IS_ERR(wb->task))

            return PTR_ERR(wb->task);

    }



    bdi_debug_register(bdi, dev_name(dev));

    set_bit(BDI_registered, &bdi->state);



    /* Add the bdi to the global list under bdi_lock. */
    spin_lock_bh(&bdi_lock);

    list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

    spin_unlock_bh(&bdi_lock);



    trace_writeback_bdi_register(bdi);

    return 0;

}

EXPORT_SYMBOL(bdi_register);
接着跟进函数bdi_forker_thread，它是bdi-default内核进程的主体：
[cpp] view plain copy

/*
 * bdi_forker_thread - body of the thread started by bdi_register() for the
 * default bdi.
 * @ptr: the struct bdi_writeback of the default bdi
 *
 * Each pass of the endless loop first writes back anything queued on the
 * default bdi itself, then scans bdi_list under bdi_lock and picks at most
 * one action: create a writeback thread for a bdi that has work but no
 * thread, or stop a thread that has been idle for longer than
 * bdi_longest_inactive(). With nothing to do it sleeps.
 *
 * The visible loop has no exit, so the final "return 0" is not reached
 * from it.
 */
static int bdi_forker_thread(void *ptr)

{

struct bdi_writeback *me = ptr;



current->flags |= PF_SWAPWRITE;

set_freezable();



/*

* Our parent may run at a different priority, just set us to normal

*/

set_user_nice(current, 0);



for (;;) {

struct task_struct *task = NULL;

struct backing_dev_info *bdi;

enum {

NO_ACTION, /* Nothing to do */

FORK_THREAD, /* Fork bdi thread */

KILL_THREAD, /* Kill inactive bdi thread */

} action = NO_ACTION;



/*

* Temporary measure, we want to make sure we don't see

* dirty data on the default backing_dev_info

*/

if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {

del_timer(&me->wakeup_timer);

wb_do_writeback(me, 0);

}



spin_lock_bh(&bdi_lock);

/*

* In the following loop we are going to check whether we have

* some work to do without any synchronization with tasks

* waking us up to do work for them. Set the task state here

* so that we don't miss wakeups after verifying conditions.

*/

set_current_state(TASK_INTERRUPTIBLE);

/* Walk every bdi and check whether it has dirty data; if it does, a thread has to be forked for it to perform the writeback */

list_for_each_entry(bdi, &bdi_list, bdi_list) {

bool have_dirty_io;



if (!bdi_cap_writeback_dirty(bdi) ||

bdi_cap_flush_forker(bdi))

continue;



WARN(!test_bit(BDI_registered, &bdi->state),

"bdi %p/%s is not registered! ", bdi, bdi->name);

/* Check for dirty data: queued work items or dirty IO on the bdi's wb */

have_dirty_io = !list_empty(&bdi->work_list) ||

wb_has_dirty_io(&bdi->wb);



/*

* If the bdi has work to do, but the thread does not

* exist - create it.

*/

if (!bdi->wb.task && have_dirty_io) {

/*

* Set the pending bit - if someone will try to

* unregister this bdi - it'll wait on this bit.

*/

/* Dirty data exists and there is no thread yet, so request a FORK of the thread */

set_bit(BDI_pending, &bdi->state);

action = FORK_THREAD;

break;

}



spin_lock(&bdi->wb_lock);



/*

* If there is no work to do and the bdi thread was

* inactive long enough - kill it. The wb_lock is taken

* to make sure no-one adds more work to this bdi and

* wakes the bdi thread up.

*/

/* If a bdi has had no dirty data for a long time, request a KILL to end that bdi's writeback thread */

if (bdi->wb.task && !have_dirty_io &&

time_after(jiffies, bdi->wb.last_active +

bdi_longest_inactive())) {

task = bdi->wb.task;

bdi->wb.task = NULL;

spin_unlock(&bdi->wb_lock);

set_bit(BDI_pending, &bdi->state);

action = KILL_THREAD;

break;

}

spin_unlock(&bdi->wb_lock);

}

spin_unlock_bh(&bdi_lock);



/* Keep working if default bdi still has things to do */

if (!list_empty(&me->bdi->work_list))

__set_current_state(TASK_RUNNING);

/* Carry out the FORK or KILL action chosen above */

switch (action) {

case FORK_THREAD:

/* Fork a bdi_writeback_thread; per the original note the thread is named flush-major:minor (the name is built from dev_name(bdi->dev)) */

__set_current_state(TASK_RUNNING);

task = kthread_create(bdi_writeback_thread, &bdi->wb,

"flush-%s", dev_name(bdi->dev));

if (IS_ERR(task)) {

/*

* If thread creation fails, force writeout of

* the bdi from the thread. Hopefully 1024 is

* large enough for efficient IO.

*/

writeback_inodes_wb(&bdi->wb, 1024,

WB_REASON_FORKER_THREAD);

} else {

/*

* The spinlock makes sure we do not lose

* wake-ups when racing with 'bdi_queue_work()'.

* And as soon as the bdi thread is visible, we

* can start it.

*/

spin_lock_bh(&bdi->wb_lock);

bdi->wb.task = task;

spin_unlock_bh(&bdi->wb_lock);

wake_up_process(task);

}

bdi_clear_pending(bdi);

break;



case KILL_THREAD:

/* Kill the thread taken from bdi->wb.task during the scan */

__set_current_state(TASK_RUNNING);

kthread_stop(task);

bdi_clear_pending(bdi);

break;



case NO_ACTION:

/* No action to perform, so put this thread to sleep for a while */

if (!wb_has_dirty_io(me) || !dirty_writeback_interval)

/*

* There are no dirty data. The only thing we

* should now care about is checking for

* inactive bdi threads and killing them. Thus,

* let's sleep for longer time, save energy and

* be friendly for battery-driven devices.

*/

schedule_timeout(bdi_longest_inactive());

else

schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));

try_to_freeze();

break;

}

}



return 0;

}
3. bdi相关数据结构

    在bdi数据结构中定义了一个writeback对象，该对象是对writeback内核线程的描述，并且封装了需要处理的inode队列。在bdi数据结构中有一条work_list，该work队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理，那么writeback内核线程将会睡眠等待。

    writeback

    writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache需要刷新radix tree上的inode时，可以将该inode挂载到writeback对象的b_dirty队列上，然后唤醒writeback线程。在处理过程中，inode会被移到b_io队列上进行处理。多条链表的方式可以降低多线程之间的资源共享。writeback数据结构具体定义如下：
[cpp] view plain copy

/*
 * Per-bdi writeback state: the writeback thread, its delayed-wakeup timer
 * and the inode lists it works through (b_dirty, b_io, b_more_io), which
 * are protected by list_lock.
 */
struct bdi_writeback {

struct backing_dev_info *bdi; /* our parent bdi */

unsigned int nr; /* NOTE(review): meaning not shown in this excerpt - confirm */



unsigned long last_old_flush; /* last old data flush */

unsigned long last_active; /* last time bdi thread was active */



struct task_struct *task; /* writeback thread */

struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */

struct list_head b_dirty; /* dirty inodes */

struct list_head b_io; /* parked for writeback */

struct list_head b_more_io; /* parked for more writeback */

spinlock_t list_lock; /* protects the b_* lists */

};
writeback work

wb_writeback_work数据结构是对writeback任务的封装，不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是writeback_work。如果writeback_work队列为空，那么内核线程就可以睡眠了。

wb_writeback_work的数据结构定义如下：
[cpp] view plain copy

/*
 * One unit of writeback work. Items are linked through @list and, when
 * @done is set, the submitter is notified on completion.
 */
struct wb_writeback_work {

long nr_pages;

struct super_block *sb; /* superblock object */

unsigned long *older_than_this;

enum writeback_sync_modes sync_mode;

unsigned int tagged_writepages:1;

unsigned int for_kupdate:1;

unsigned int range_cyclic:1;

unsigned int for_background:1;

enum wb_reason reason; /* why was writeback initiated? */



struct list_head list; /* pending work list; linked into the bdi->work_list queue */

struct completion *done; /* set if the caller waits; used to notify the caller when the work completes */

};
4. writeback主要函数分析

   writeback机制的主要函数包括如下两个方面：

   1. 管理bdi对象并且fork相应的writeback内核线程处理cache数据的刷新工作。

   2. writeback内核线程处理函数，实现dirty page的刷新操作。

writeback线程管理

    Linux中有一个内核守护线程，该线程用来管理系统bdi队列，并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候，bdi_forker_thread程序会为其分配线程资源；当一个writeback线程长时间处于空闲状态时，bdi_forker_thread程序会释放该线程资源。

转自：http://blog.csdn.net/myarrow/article/details/8918944
查看全文

相关阅读:
《代码阅读方法与实践》阅读笔记之二
 《代码阅读方法与实践》阅读笔记一
 专业实训题目需求分析
 阅读计划
 第二阶段Sprint10
第二阶段Sprint9
第二阶段Sprint8
第二阶段Sprint7
第二阶段个人工作总结（8）
第二阶段个人工作总结（7）

原文地址：https://www.cnblogs.com/MerlinJ/p/4058694.html

【转】Linux Writeback机制分析

1. bdi是什么?

2. bdi工作模型

3. bdi相关数据结构

4. writeback主要函数分析