zoukankan html css js c++ java

Linux3.10.0块IO子系统流程（2）-- 构造、排序、合并请求

Linux块设备可以分为三类，分别针对顺序访问物理设备、随机访问物理设备和逻辑设备（即“栈式设备”）。

类型	make_request_fn	request_fn	备注
SCSI 设备等	从bio构造request（经过合并和排序），返回0	逐个处理request	调用blk_init_queue，使用默认的blk_queue_bio（旧版内核中名为__make_request），提供策略例程
SSD等	直接处理bio，返回0	无	调用blk_alloc_queue，提供make_request_fn
RAID或Device Mapper设备	重定向bio，返回非零值	无	调用blk_alloc_queue，提供make_request_fn

blk_init_queue原型：

 1 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)    // convenience wrapper around blk_init_queue_node()
 2 {
 3     return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);    // forwards rfn and lock unchanged, passing NUMA_NO_NODE as node_id
 4 }
 5 EXPORT_SYMBOL(blk_init_queue);
 6 
 7 struct request_queue *
 8 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)    // allocates a queue for node_id, then initialises it; returns NULL on failure
 9 {
10     struct request_queue *uninit_q, *q;
11     uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
12     if (!uninit_q)
13         return NULL;
14     q = blk_init_allocated_queue(uninit_q, rfn, lock);
15     if (!q)
16         blk_cleanup_queue(uninit_q);    // initialisation failed: clean up the queue allocated above
17     return q;
18 }
19 EXPORT_SYMBOL(blk_init_queue_node);
20 
21 struct request_queue *
22 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
23              spinlock_t *lock)    // sets up an already-allocated queue q; returns q, or NULL on failure
24 {
25     if (!q)
26         return NULL;
27     if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
28         return NULL;    // q itself is not cleaned up here; blk_init_queue_node() does that when it sees NULL
29     q->request_fn        = rfn;
30     q->prep_rq_fn        = NULL;
31     q->unprep_rq_fn        = NULL;
32     q->queue_flags        |= QUEUE_FLAG_DEFAULT;
33     /* Override internal queue lock with supplied lock pointer */
34     if (lock)
35         q->queue_lock        = lock;
36     /*
37      * This also sets hw/phys segments, boundary and size
38      */
39     blk_queue_make_request(q, blk_queue_bio);　　// queues set up via blk_init_queue get blk_queue_bio bound as the default bio handler
40     q->sg_reserved_size = INT_MAX;
41     /* init elevator */
42     if (elevator_init(q, NULL))    // initialise the IO scheduler
43         return NULL;
44     return q;
45 }
46 EXPORT_SYMBOL(blk_init_allocated_queue);

下面来跟踪blk_queue_bio函数：

  1 void blk_queue_bio(struct request_queue *q, struct bio *bio)    // handles one bio for queue q: merge it into an existing request, or build a new request for it
  2 {
  3     const bool sync = !!(bio->bi_rw & REQ_SYNC);
  4     struct blk_plug *plug;
  5     int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
  6     struct request *req;
  7     unsigned int request_count = 0;
  8     /*
  9      * low level driver can indicate that it wants pages above a
 10      * certain limit bounced to low memory (ie for highmem, or even
 11      * ISA dma in theory)
 12      */
 13     blk_queue_bounce(q, &bio);    // create a bounce buffer if one is needed
 14     if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 15         bio_endio(bio, -EIO);
 16         return;
 17     }
 18     if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 19         spin_lock_irq(q->queue_lock);
 20         where = ELEVATOR_INSERT_FLUSH;
 21         goto get_rq;    // FLUSH/FUA bios skip the merge attempts below
 22     }
 23     /*
 24      * Check if we can merge with the plugged list before grabbing any locks
 25      * First, try to merge the bio into an existing request
 26      */
 27     if (attempt_plug_merge(q, bio, &request_count))
 28         return;
 29     spin_lock_irq(q->queue_lock);
 30     el_ret = elv_merge(q, &req, bio);    // check whether the bio can be merged
 31     // a possible merge is either a back merge or a front merge
 32     if (el_ret == ELEVATOR_BACK_MERGE) {
 33         if (bio_attempt_back_merge(q, req, bio)) {
 34             elv_bio_merged(q, req, bio);    // the merge is carried out if the hardware allows it
 35             if (!attempt_back_merge(q, req))    // after the bio merge, two requests may in turn be mergeable
 36                 elv_merged_request(q, req, el_ret);
 37             goto out_unlock;
 38         }
 39     } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 40         if (bio_attempt_front_merge(q, req, bio)) {
 41             elv_bio_merged(q, req, bio);
 42             if (!attempt_front_merge(q, req))
 43                 elv_merged_request(q, req, el_ret);
 44             goto out_unlock;
 45         }
 46     }
 47 // no merge was possible, so build a request from the bio
 48 get_rq:
 49     /*
 50      * This sync check and mask will be re-done in init_request_from_bio(),
 51      * but we need to set it earlier to expose the sync flag to the
 52      * rq allocator and io schedulers.
 53      */
 54     rw_flags = bio_data_dir(bio);
 55     if (sync)
 56         rw_flags |= REQ_SYNC;
 57     /*
 58      * Grab a free request. This is might sleep but can not fail.
 59      * Returns with the queue unlocked.
 60      */
 61     req = get_request(q, rw_flags, bio, GFP_NOIO);    // obtain a request
 62     if (unlikely(!req)) {
 63         bio_endio(bio, -ENODEV);    /* @q is dead */
 64         goto out_unlock;
 65     }
 66     /*
 67      * After dropping the lock and possibly sleeping here, our request
 68      * may now be mergeable after it had proven unmergeable (above).
 69      * We don't worry about that case for efficiency. It won't happen
 70      * often, and the elevators are able to handle it.
 71      */
 72     init_request_from_bio(req, bio);    // initialise the request from the bio; it is queued further below
 73     if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
 74         req->cpu = raw_smp_processor_id();
 75     plug = current->plug;
 76     // next comes the plug/unplug policy
 77     if (plug) {
 78         /*
 79          * If this is the first request added after a plug, fire
 80          * of a plug trace. If others have been added before, check
 81          * if we have multiple devices in this plug. If so, make a
 82          * note to sort the list before dispatch.
 83          */
 84         if (list_empty(&plug->list))
 85             trace_block_plug(q);
 86         else {
 87             if (request_count >= BLK_MAX_REQUEST_COUNT) {
 88                 blk_flush_plug_list(plug, false);
 89                 trace_block_plug(q);
 90             }
 91         }
 92         list_add_tail(&req->queuelist, &plug->list);    // plugged: the request is only appended to the task's plug list
 93         drive_stat_acct(req, 1);
 94     } else {
 95         spin_lock_irq(q->queue_lock);
 96         add_acct_request(q, req, where);　　// add the request to the IO scheduler queue or the request queue; mainly used for handling barrier requests
 97         __blk_run_queue(q);
 98 out_unlock:    // the merge paths above jump here with queue_lock held
 99         spin_unlock_irq(q->queue_lock);
100     }
101 }
102 EXPORT_SYMBOL_GPL(blk_queue_bio);    /* for device mapper only */

第13行，blk_queue_bounce创建一个反弹缓冲区。反弹缓冲区通常用于驱动尝试在外围设备不可达的地址（例如高端内存）上执行DMA的情形。创建反弹缓冲区后，数据要在原缓冲区和反弹缓冲区之间进行与读写方向对应的复制。毫无疑问，使用反弹缓冲区会降低性能，但也没有其他办法。

所谓反弹，实际上是分配一个新的bio描述符，它和原始bio的segment一一对应。如果原始bio的segment使用的页面在DMA内存范围外，则分配一个在DMA范围内的页面，赋给新的bio对应的segment。对于写操作，需要将旧bio页面的内容复制到新的bio中。如果原始的bio的segment使用的页面在DMA范围内，则将新的bio指向同一地方。

最后将原始bio保存在新的bio的bi_private域中，并设置新bio的完成回调函数。

接下来交给IO调度器，由它负责合并和排序请求。合并是指将对磁盘上连续位置的请求合并为一个，通过一次SCSI命令完成。排序是将多个请求对磁盘上的访问位置顺序重新排列，使得磁头尽可能向一个方向移动。请求的合并和排序是在SCSI设备的请求队列描述符上进行的。

 1 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)    // decides whether bio can be merged; returns an ELEVATOR_*_MERGE code and may set *req
 2 {
 3     struct elevator_queue *e = q->elevator;
 4     struct request *__rq;
 5     int ret;
 6 
 7     /*
 8      * Levels of merges:
 9      *     nomerges:  No merges at all attempted
10      *     noxmerges: Only simple one-hit cache try
11      *     merges:       All merge tries attempted
12      */
13     if (blk_queue_nomerges(q))    // if QUEUE_FLAG_NOMERGES is set, return immediately without merging
14         return ELEVATOR_NO_MERGE;
15 
16     /*
17      * First try one-hit cache.
18      */
19     // if the queue's last_merge holds a cached request, call blk_try_merge to try merging with it; on success, hand that request back through req
20     if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) {
21         ret = blk_try_merge(q->last_merge, bio);
22         if (ret != ELEVATOR_NO_MERGE) {
23             *req = q->last_merge;
24             return ret;
25         }
26     }
27 
28      // if QUEUE_FLAG_NOXMERGES is set, no "extended" merge attempts are made
29     if (blk_queue_noxmerges(q))
30         return ELEVATOR_NO_MERGE;
31 
32     /*
33      * See if our hash lookup can find a potential backmerge.
34      * The code below is the so-called "extended" merge attempt. It has two parts:
35      * the first applies to every IO scheduling algorithm, the second is specific to each algorithm.
36      */
37     __rq = elv_rqhash_find(q, bio->bi_sector);
38     if (__rq && elv_rq_merge_ok(__rq, bio)) {
39         *req = __rq;
40         return ELEVATOR_BACK_MERGE;
41     }
42 
43     /*
44      * The scheduler-specific merge algorithm is implemented through the elevator_merge_fn callback of the elevator queue's operations table
45      */
46     if (e->type->ops.elevator_merge_fn)
47         return e->type->ops.elevator_merge_fn(q, req, bio);
48 
49     return ELEVATOR_NO_MERGE;
50 }

如果我们的请求不能合并到现有的request中，那么就要新申请request描述符了，根据bio对它初始化，并添加到IO调度器队列

最后Linux块设备层采用蓄流/泄流技术来改进吞吐量，蓄流是为了将请求合并和排序，然后一起泄流，泄流函数为__blk_run_queue(q)

/**
* __blk_run_queue - run a single device queue
* @q:    The queue to run
*
* Description:
*    See @blk_run_queue. This variant must be called with the queue lock
*    held and interrupts disabled. It does nothing when the queue is
*    stopped; otherwise it hands over to __blk_run_queue_uncond().
*/
void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;    /* a stopped queue is not run */
    __blk_run_queue_uncond(q);
}



/**
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
* @q:    The queue to run
*
* Description:
*    Invoke request handling on a queue if there are any pending requests.
*    May be used to restart request handling after a request has completed.
*    This variant runs the queue whether or not the queue has been
*    stopped. Must be called with the queue lock held and interrupts
*    disabled. See also @blk_run_queue.
*/
inline void __blk_run_queue_uncond(struct request_queue *q)
{
    if (unlikely(blk_queue_dead(q)))
        return;    /* a dead queue is never run, stopped or not */
    /*
     * Some request_fn implementations, e.g. scsi_request_fn(), unlock
     * the queue lock internally. As a result multiple threads may be
     * running such a request function concurrently. Keep track of the
     * number of active request_fn invocations such that blk_drain_queue()
     * can wait until all these request_fn calls have finished.
     */
    q->request_fn_active++;
    q->request_fn(q);    // for SCSI devices this callback is scsi_request_fn, the so-called SCSI strategy routine
    q->request_fn_active--;
}

__blk_run_queue

对于SCSI设备，在为它分配请求队列时，将请求队列的request_fn回调函数实例化为scsi_request_fn，也就是通常所说的SCSI策略例程。

查看全文

相关阅读:
使用SignTool对软件安装包进行数字签名（二）--进行数字签名
 使用SignTool对软件安装包进行数字签名（一）--制作证书
 三角形相关算法--求解三角形顶点坐标
 子网掩码与子网个数、主机地址个数的关系
 pgsql中的lateral使用小结
 Git中rebase失败了如何进行恢复
 灰度发布
 go 中的WaitGroup
pgsql中json格式数组查询结果变成了字符串
 Go中的unsafe

原文地址：https://www.cnblogs.com/luxiaodai/p/9257021.html