zoukankan      html  css  js  c++  java
  • Linux3.10.0块IO子系统流程(3)-- SCSI策略例程

    很长时间以来,Linux块设备使用了一种称为“蓄流/泄流”(plugging/unplugging)的技术来改进吞吐率。简单而言,这种工作方式类似浴盆排水系统的塞子。当IO被提交时,它被储存在一个队列,稍后的某个时间,我们才允许IO从队列派发出去。之所以这么做是为IO尽可能做合并和排序。

      /*
       * scsi_request_fn - request function of a SCSI device queue
       * @q: request queue whose queuedata points at the scsi_device
       *
       * Peeks requests off @q one at a time and hands the prepared SCSI command
       * of each to scsi_dispatch_cmd(), until the queue is empty or the device,
       * target or host cannot accept more work.  q->queue_lock is expected to be
       * held on entry (it is released with spin_unlock() before a command is
       * dispatched) and is held again on return.
       *
       * NOTE(review): queue_lock is dropped with plain spin_unlock() but retaken
       * with spin_lock_irq(); this presumably relies on the caller having
       * interrupts disabled already - confirm against the block layer.
       */
      1 static void scsi_request_fn(struct request_queue *q)
      2 {
      3     struct scsi_device *sdev = q->queuedata;
      4     struct Scsi_Host *shost;
      5     struct scsi_cmnd *cmd;
      6     struct request *req;
      7     if(!get_device(&sdev->sdev_gendev))
      8         /* We must be tearing the block queue down already */
      9         return;
     10     /*
     11      * To start with, we keep looping until the queue is empty, or until
     12      * the host is no longer able to accept any more requests.
     13      */
     14     shost = sdev->host;
     15     for (;;) {
     16         int rtn;
     17         /*
     18          * get next queueable request.  We do this early to make sure
     19          * that the request is fully prepared even if we cannot
     20          * accept it.
     21          */
     22         req = blk_peek_request(q);    // leave the loop when there is no request or the SCSI device cannot take one right now
     23         if (!req || !scsi_dev_queue_ready(q, sdev))
     24             break;
     25         /* Device offline: log an error, drop the request with scsi_kill_request(), and treat every following request the same way */
     26         if (unlikely(!scsi_device_online(sdev))) {
     27             sdev_printk(KERN_ERR, sdev,
     28                     "rejecting I/O to offline device\n");
     29             scsi_kill_request(req, q);
     30             continue;
     31         }
     32         /*
     33          * Remove the request from the request list.
     34          * blk_start_request() is skipped only when the queue is tagged and blk_queue_start_tag() returned 0 for this request; otherwise the driver starts the request here (per the article, this dequeues it and arms its timeout timer).
     35          */
     36         if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
     37             blk_start_request(req);
     38         sdev->device_busy++;
     39         spin_unlock(q->queue_lock);
     40         /* Fetch the SCSI command descriptor from the request's special field; per the article it was set up by the queue's prep_rq_fn callback during the earlier blk_peek_request() call */
     41         cmd = req->special;
     42         if (unlikely(cmd == NULL)) {
     43             printk(KERN_CRIT "impossible request in %s.\n"
     44                      "please mail a stack trace to "
     45                      "linux-scsi@vger.kernel.org\n",
     46                      __func__);
     47             blk_dump_rq_flags(req, "foo");
     48             BUG();
     49         }
     50         spin_lock(shost->host_lock);
     51         /*
     52          * We hit this when the driver is using a host wide
     53          * tag map. For device level tag maps the queue_depth check
     54          * in the device ready fn would prevent us from trying
     55          * to allocate a tag. Since the map is a shared host resource
     56          * we add the dev to the starved list so it eventually gets
     57          * a run when a tag is freed.
     58          */
     59         if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
     60             if (list_empty(&sdev->starved_entry))
     61                 list_add_tail(&sdev->starved_entry,
     62                           &shost->starved_list);
     63             goto not_ready;
     64         }
     65         if (!scsi_target_queue_ready(shost, sdev))
     66             goto not_ready;
     67         if (!scsi_host_queue_ready(q, shost, sdev))
     68             goto not_ready;
     69         scsi_target(sdev)->target_busy++;
     70         shost->host_busy++;
     71         /*
     72          * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
     73          *        take the lock again.
     74          */
     75         spin_unlock_irq(shost->host_lock);
     76         /*
     77          * Finally, initialize any error handling parameters, and set up the timers for timeouts.
     78          * This runs after host_lock has been dropped.
     79          */
     80         scsi_init_cmd_errh(cmd);
     81         /*
     82          * Dispatch the command to the low-level driver.
     83          * A non-zero return value sends us to out_delay below.
     84          */
     85         rtn = scsi_dispatch_cmd(cmd);
     86         spin_lock_irq(q->queue_lock);
     87         if (rtn)
     88             goto out_delay;
     89     }
     90     goto out;
     91 
     92 not_ready:
     93     spin_unlock_irq(shost->host_lock);
     94     /*
     95      * lock q, handle tag, requeue req, and decrement device_busy. We
     96      * must return with queue_lock held.
     97      *
     98      * Decrementing device_busy without checking it is OK, as all such
     99      * cases (host limits or settings) should run the queue at some
    100      * later time.
    101      */
    102     spin_lock_irq(q->queue_lock);
    103     blk_requeue_request(q, req);
    104     sdev->device_busy--;
    105 out_delay:
    106     if (sdev->device_busy == 0)
    107         blk_delay_queue(q, SCSI_QUEUE_DELAY);
    108 out:
    109     /* must be careful here...if we trigger the ->remove() function
    110      * we cannot be holding the q lock */
    111     spin_unlock_irq(q->queue_lock);
    112     put_device(&sdev->sdev_gendev);
    113     spin_lock_irq(q->queue_lock);
    114 }

    blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现就是一个大循环,每次调用__elv_next_request从电梯队列中取出一个请求进行处理,直到得到一个可以返回给调用者的请求(或者队列为空、请求需要延后而返回NULL)为止。

      1 /**
      2 * blk_peek_request - peek at the top of a request queue
      3 * @q: request queue to peek at
      4 *
      5 * Description:
      6 *     Return the request at the top of @q.  The returned request
      7 *     should be started using blk_start_request() before LLD starts
      8 *     processing it.
      9 *
     10 * Return:
     11 *     Pointer to the request at the top of @q if available.  Null
     12 *     otherwise.
     13 *
     14 * Context:
     15 *     queue_lock must be held.
     16 */
     17 struct request *blk_peek_request(struct request_queue *q)
     18 {
     19     struct request *rq;
     20     int ret;
     21 
     22     while ((rq = __elv_next_request(q)) != NULL) {
     23 
     24         rq = blk_pm_peek_request(q, rq);
     25         if (!rq)
     26             break;
     27         /* A request is either brand new or was put back on the queue because it could not be handled yet; in the latter case REQ_STARTED is already set.
     28           * So a clear flag means this is the first time we see the request; if it carries REQ_SORTED, elv_activate_rq() is called to notify the IO scheduler
     29           */
     30         if (!(rq->cmd_flags & REQ_STARTED)) {
     31             /*
     32              * This is the first time the device driver
     33              * sees this request (possibly after
     34              * requeueing).  Notify IO scheduler.
     35              */
     36             if (rq->cmd_flags & REQ_SORTED)
     37                 elv_activate_rq(q, rq);
     38 
     39             /*
     40              * just mark as started even if we don't start
     41              * it, a request that has been delayed should
     42              * not be passed by new incoming requests
     43              */
     44             rq->cmd_flags |= REQ_STARTED;
     45             trace_block_rq_issue(q, rq);
     46         }
     47         /* Book-keeping shared with the IO scheduler: record the end sector and clear the boundary request */
     48         if (!q->boundary_rq || q->boundary_rq == rq) {
     49             q->end_sector = rq_end_sector(rq);
     50             q->boundary_rq = NULL;
     51         }
     52 
     53         /* If the request (not the queue) has REQ_DONTPREP set, no preparation is needed: leave the loop and return this request to the caller */
     54         if (rq->cmd_flags & REQ_DONTPREP)
     55             break;
     56 
     57         /* 
     58           * A non-zero dma_drain_size on the queue means the device has an "excess DMA" problem; in that case one extra
     59           * segment is reserved on the request so a "drain buffer" can later be appended after the scatter-gather list
     60           */
     61         if (q->dma_drain_size && blk_rq_bytes(rq)) {
     62             /*
     63              * make sure space for the drain appears we
     64              * know we can do this because max_hw_segments
     65              * has been adjusted to be one fewer than the
     66              * device can handle
     67              */
     68             rq->nr_phys_segments++;
     69         }
     70         /* 
     71           * Without a prep_rq_fn callback, return the request as it is.
     72           * Otherwise call the callback to prepare the request; three return values are handled:
     73           *     BLKPREP_OK:    preparation succeeded, return the request
     74           *     BLKPREP_DEFER: cannot proceed yet; the request stays queued and NULL is returned
     75           *     BLKPREP_KILL:  the request cannot be processed; it is failed with -EIO and, instead of leaving the loop, the next request is tried
     76           */
     77         if (!q->prep_rq_fn)
     78             break;
     79 
     80         ret = q->prep_rq_fn(q, rq);
     81         if (ret == BLKPREP_OK) {
     82             break;
     83         } else if (ret == BLKPREP_DEFER) {
     84             /*
     85              * the request may have been (partially) prepped.
     86              * we need to keep this request in the front to
     87              * avoid resource deadlock.  REQ_STARTED will
     88              * prevent other fs requests from passing this one.
     89              */
     90             if (q->dma_drain_size && blk_rq_bytes(rq) &&
     91                 !(rq->cmd_flags & REQ_DONTPREP)) {
     92                 /*
     93                  * remove the space for the drain we added
     94                  * so that we don't add it again
     95                  */
     96                 --rq->nr_phys_segments;
     97             }
     98 
     99             rq = NULL;
    100             break;
    101         } else if (ret == BLKPREP_KILL) {
    102             rq->cmd_flags |= REQ_QUIET;
    103             /*
    104              * Mark this request as started so we don't trigger
    105              * any debug logic in the end I/O path.
    106              */
    107             blk_start_request(rq);
    108             __blk_end_request_all(rq, -EIO);
    109         } else {
    110             printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
    111             break;
    112         }
    113     }
    114 
    115     return rq;
    116 }
    请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法,prep_rq_fn回调函数关键有两个任务:
    1. 构造命令描述块
    2. 如果需要的话为数据传输准备聚散列表
    命令描述块和聚散列表都被封装到SCSI命令描述符中,我们知道,请求至少有两个来源
    1. 来自上层bio
    2. 来自SCSI公共服务层
    在刚找到SCSI设备为其初始化请求队列时,这个回调函数被设置为scsi_prep_fn
     
     /*
      * scsi_alloc_queue - create the request queue for a SCSI device
      * @sdev: device the queue is allocated for
      *
      * Obtains a queue from __scsi_alloc_queue(), passing scsi_request_fn
      * (presumably installed as the queue's request function - confirm in
      * __scsi_alloc_queue()), then registers the default SCSI callbacks:
      * scsi_prep_fn, scsi_softirq_done, scsi_times_out and scsi_lld_busy.
      *
      * Returns the new queue, or NULL if __scsi_alloc_queue() fails.
      */
     1 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
     2 {
     3     struct request_queue *q;
     4 
     5     q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
     6     if (!q)
     7         return NULL;
     8 
     9     blk_queue_prep_rq(q, scsi_prep_fn);    /* default prepare callback, stored in q->prep_rq_fn */
    10     blk_queue_softirq_done(q, scsi_softirq_done);
    11     blk_queue_rq_timed_out(q, scsi_times_out);
    12     blk_queue_lld_busy(q, scsi_lld_busy);
    13     return q;
    14 }
    15 
    16 /**
    17 * blk_queue_prep_rq - set a prepare_request function for queue
    18 * @q:        queue
    19 * @pfn:    prepare_request function
    20 *
    21 * It's possible for a queue to register a prepare_request callback which
    22 * is invoked before the request is handed to the request_fn. The goal of
    23 * the function is to prepare a request for I/O, it can be used to build a
    24 * cdb from the request data for instance.
    25 *
    26 */
    27 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
    28 {
    29     q->prep_rq_fn = pfn;    /* plain store: any previously registered callback is simply overwritten */
    30 }
    初始化回调
    如果SCSI设备被高层驱动绑定,这个回调函数会被修改,例如,在sd驱动探测流程的sd_probe_async中(见下面的代码)被设置成sd_prep_fn
     
     /*
      * sd_probe_async - finish setting up a SCSI disk
      * @data:   the struct scsi_disk being probed
      * @cookie: async cookie; not used in this function
      *
      * Fills in the gendisk (major/minor numbers, fops, queue), resets the
      * scsi_disk fields to defaults, revalidates the disk, switches the queue's
      * prepare/unprepare callbacks to sd_prep_fn/sd_unprep_fn, registers the
      * disk with add_disk() and finally drops the references it was handed.
      */
     1 static void sd_probe_async(void *data, async_cookie_t cookie)
     2 {
     3     struct scsi_disk *sdkp = data;
     4     struct scsi_device *sdp;
     5     struct gendisk *gd;
     6     u32 index;
     7     struct device *dev;
     8 
     9     sdp = sdkp->device;
    10     gd = sdkp->disk;
    11     index = sdkp->index;
    12     dev = &sdp->sdev_gendev;
    13 
    14     gd->major = sd_major((index & 0xf0) >> 4);
    15     gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
    16     gd->minors = SD_MINORS;
    17 
    18     gd->fops = &sd_fops;
    19     gd->private_data = &sdkp->driver;
    20     gd->queue = sdkp->device->request_queue;
    21 
    22     /* defaults, until the device tells us otherwise */
    23     sdp->sector_size = 512;
    24     sdkp->capacity = 0;
    25     sdkp->media_present = 1;
    26     sdkp->write_prot = 0;
    27     sdkp->cache_override = 0;
    28     sdkp->WCE = 0;
    29     sdkp->RCD = 0;
    30     sdkp->ATO = 0;
    31     sdkp->first_scan = 1;
    32     sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;
    33 
    34     sd_revalidate_disk(gd);
    35 
    36     blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);    /* overrides the previously registered prep_rq_fn */
    37     blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
    38 
    39     gd->driverfs_dev = &sdp->sdev_gendev;
    40     gd->flags = GENHD_FL_EXT_DEVT;
    41     if (sdp->removable) {
    42         gd->flags |= GENHD_FL_REMOVABLE;
    43         gd->events |= DISK_EVENT_MEDIA_CHANGE;
    44     }
    45 
    46     add_disk(gd);
    47     if (sdkp->capacity)
    48         sd_dif_config_host(sdkp);
    49 
    50     sd_revalidate_disk(gd);    /* second revalidation, now that the disk has been added */
    51 
    52     sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
    53           sdp->removable ? "removable " : "");
    54     blk_pm_runtime_init(sdp->request_queue, dev);
    55     scsi_autopm_put_device(sdp);
    56     put_device(&sdkp->dev);
    57 }
    初始化回调

    在前一种情况下,SCSI设备只能处理来自SCSI公共服务层的请求;在后一种情况下,SCSI设备不仅能处理来自SCSI公共服务层的请求,还能够处理来自上层的bio请求,分析见下一节。

     
     
  • 相关阅读:
    start tag, end tag issues in IE7, particularly in xslt transformation
    用SandCastle为注释生成chm文档
    Firebug
    架构的重点
    Linux Shell常用技巧(十) 管道组合
    Linux JDK升级
    Linux Shell常用技巧(十二) Shell编程
    Packet Tracer 5.0实验(一) 交换机的基本配置与管理
    Linux Shell常用技巧(六) sort uniq tar split
    Linux Shell常用技巧(二) grep
  • 原文地址:https://www.cnblogs.com/luxiaodai/p/9266309.html
Copyright © 2011-2022 走看看