zoukankan      html  css  js  c++  java
  • linux内核源码阅读之facebook硬盘加速flashcache之五


    正常流程到flashcache_map的1623行或1625行,按顺序先看读流程:
    1221static void
    1222flashcache_read(struct cache_c *dmc, struct bio *bio)
    1223{
    1224	int index;
    1225	int res;
    1226	struct cacheblock *cacheblk;
    1227	int queued;
    1228
    1229	DPRINTK("Got a %s for %llu  %u bytes)",
    1230	        (bio_rw(bio) == READ ? "READ":"READA"), 
    1231		bio->bi_sector, bio->bi_size);
    1232
    1233	spin_lock_irq(&dmc->cache_spin_lock);
    1234	res = flashcache_lookup(dmc, bio, &index);
    1235	/* 
    1236	 * Handle Cache Hit case first.
    1237	 * We need to handle 2 cases, BUSY and !BUSY. If BUSY, we enqueue the
    1238	 * bio for later.
    1239	 */
    1240	if (res > 0) {
    1241		cacheblk = &dmc->cache[index];
    1242		if ((cacheblk->cache_state & VALID) && 
    1243		    (cacheblk->dbn == bio->bi_sector)) {
    1244			flashcache_read_hit(dmc, bio, index);
    1245			return;
    1246		}
    1247	}
    1248	/*
    1249	 * In all cases except for a cache hit (and VALID), test for potential 
    1250	 * invalidations that we need to do.
    1251	 */
    1252	queued = flashcache_inval_blocks(dmc, bio);
    1253	if (queued) {
    1254		if (unlikely(queued < 0))
    1255			flashcache_bio_endio(bio, -EIO);
    1256		spin_unlock_irq(&dmc->cache_spin_lock);
    1257		return;
    1258	}
    1259	if (res == -1 || flashcache_uncacheable(dmc)) {
    1260		/* No room or non-cacheable */
    1261		spin_unlock_irq(&dmc->cache_spin_lock);
    1262		DPRINTK("Cache read: Block %llu(%lu):%s",
    1263			bio->bi_sector, bio->bi_size, "CACHE MISS & NO ROOM");
    1264		if (res == -1)
    1265			flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector));
    1266		/* Start uncached IO */
    1267		flashcache_start_uncached_io(dmc, bio);
    1268		return;
    1269	}
    1270	/* 
    1271	 * (res == INVALID) Cache Miss 
    1272	 * And we found cache blocks to replace
    1273	 * Claim the cache blocks before giving up the spinlock
    1274	 */
    1275	if (dmc->cache[index].cache_state & VALID)
    1276		dmc->replace++;
    1277	else
    1278		dmc->cached_blocks++;
    1279	dmc->cache[index].cache_state = VALID | DISKREADINPROG;
    1280	dmc->cache[index].dbn = bio->bi_sector;
    1281	spin_unlock_irq(&dmc->cache_spin_lock);
    1282
    1283	DPRINTK("Cache read: Block %llu(%lu), index = %d:%s",
    1284		bio->bi_sector, bio->bi_size, index, "CACHE MISS & REPLACE");
    1285	flashcache_read_miss(dmc, bio, index);
    1286}

    我非常喜欢flashcache这种小资的感觉,每个函数都比较短,大部分都没有超过100行的。不像neil大哥写的md代码,函数动辄几百行、上千行,总是望啊望啊望不到边。当然不是说我不喜欢neil的代码,实际上他的代码是非常非常好的,因为md代码已经有十多年的历史了,大的框架仍然没有太大的改变,仍能保持那么优雅已经是十分难得了。最新版本的md还加了许多新功能,像bad block和replacement机制都是非常实用的。所以对于一名优秀的软件工程师来说,重要的并不在于写了多少行代码,而在于编写的软件运行在多少台机器上,为用户创造了多少价值。
    第1234行是查找bio是否命中,flashcache_lookup函数我们在之前的文章里已经分析过了。第1244行是命中的情况,我们跟进看看
    1119static void
    1120flashcache_read_hit(struct cache_c *dmc, struct bio* bio, int index)
    1121{
    1122	struct cacheblock *cacheblk;
    1123	struct pending_job *pjob;
    1124
    1125	cacheblk = &dmc->cache[index];
    1126	if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->head == NULL)) {
    1127		struct kcached_job *job;
    1128			
    1129		cacheblk->cache_state |= CACHEREADINPROG;
    1130		dmc->read_hits++;
    1131		spin_unlock_irq(&dmc->cache_spin_lock);
    1132		DPRINTK("Cache read: Block %llu(%lu), index = %d:%s",
    1133			bio->bi_sector, bio->bi_size, index, "CACHE HIT");
    1134		job = new_kcached_job(dmc, bio, index);
    1135		if (unlikely(sysctl_flashcache_error_inject & READ_HIT_JOB_ALLOC_FAIL)) {
    1136			if (job)
    1137				flashcache_free_cache_job(job);
    1138			job = NULL;
    1139			sysctl_flashcache_error_inject &= ~READ_HIT_JOB_ALLOC_FAIL;
    1140		}
    1141		if (unlikely(job == NULL)) {
    1142			/* 
    1143			 * We have a read hit, and can't allocate a job.
    1144			 * Since we dropped the spinlock, we have to drain any 
    1145			 * pending jobs.
    1146			 */
    1147			DMERR("flashcache: Read (hit) failed ! Can't allocate memory for cache IO, block %lu", 
    1148			      cacheblk->dbn);
    1149			flashcache_bio_endio(bio, -EIO);
    1150			spin_lock_irq(&dmc->cache_spin_lock);
    1151			flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1152			cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1153			spin_unlock_irq(&dmc->cache_spin_lock);
    1154		} else {
    1155			job->action = READCACHE; /* Fetch data from cache */
    1156			atomic_inc(&dmc->nr_jobs);
    1157			dmc->ssd_reads++;
    1158			dm_io_async_bvec(1, &job->cache, READ,
    1159					 bio->bi_io_vec + bio->bi_idx,
    1160					 flashcache_io_callback, job);
    1161			flashcache_unplug_device(dmc->cache_dev->bdev);
    1162		}
    1163	} else {
    1164		pjob = flashcache_alloc_pending_job(dmc);
    1165		if (unlikely(sysctl_flashcache_error_inject & READ_HIT_PENDING_JOB_ALLOC_FAIL)) {
    1166			if (pjob) {
    1167				flashcache_free_pending_job(pjob);
    1168				pjob = NULL;
    1169			}
    1170			sysctl_flashcache_error_inject &= ~READ_HIT_PENDING_JOB_ALLOC_FAIL;
    1171		}
    1172		if (pjob == NULL)
    1173			flashcache_bio_endio(bio, -EIO);
    1174		else
    1175			flashcache_enq_pending(dmc, bio, index, READCACHE, pjob);
    1176		spin_unlock_irq(&dmc->cache_spin_lock);
    1177	}
    1178}

    首先获取这个cache块的管理结构,第1126行判断cache块不忙的情况,进入1129行设置状态为从cache读,第1134行创建一个kcached_job,在1141行申请kcached_job失败时就对bio返回失败。申请成功到1155行将kcached_job的action设置为READCACHE,再调用dm_io_async_bvec下发请求,当请求回来时就会调用这里设置的回调函数flashcache_io_callback。再继续看读SSD返回后是怎么处理的,我们只看该函数中对READCACHE的处理:
    151	case READCACHE:
    152		DPRINTK("flashcache_io_callback: READCACHE %d",
    153			index);
    154		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    155		if (unlikely(sysctl_flashcache_error_inject & READCACHE_ERROR)) {
    156			job->error = error = -EIO;
    157			sysctl_flashcache_error_inject &= ~READCACHE_ERROR;
    158		}
    159		VERIFY(cacheblk->cache_state & CACHEREADINPROG);
    160		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    161		if (unlikely(error))
    162			dmc->ssd_read_errors++;
    163#ifdef FLASHCACHE_DO_CHECKSUMS
    164		if (likely(error == 0)) {
    165			if (flashcache_validate_checksum(job)) {
    166				DMERR("flashcache_io_callback: Checksum mismatch at disk offset %lu", 
    167				      job->disk.sector);
    168				error = -EIO;
    169			}
    170		}
    171#endif
    172		flashcache_bio_endio(bio, error);
    173		break;		       
    

    实际上真正有意义的就是第172行,将请求done回去了。这样我们就完成了一次读命中的处理。
    读命中处理还有一种情况就是第1163行cache块忙的情况,这个时候就申请一个pending_job,挂到cache块上,等该cache块上正在进行的请求返回之后再调度执行。

    继续回到flashcache_read,看不命中的情况。
    到第1259行flashcache_uncacheable函数是管理命令相关的,加了黑名单之后会跳过cache,直接下发到磁盘。
    到1264行res是flashcache_lookup返回的,为-1就表示获取不到可用的cache块,这时就调用flashcache_clean_set清理一下该set中的脏块(将脏块回写到磁盘,以便腾出可用的cache块)。但获取不到cache块并不是说请求就结束了,还得下发到磁盘,1267行flashcache_start_uncached_io将请求直接下发到磁盘。
    第1275行到1278行是统计信息,根据这些信息可以知道flashcache的运行状况,用于flashcache的性能优化。
    接着1279行设置cache块的状态,1280行设置cache块对应磁盘上的扇区,最后调用flashcache_read_miss下发请求:
    1180static void
    1181flashcache_read_miss(struct cache_c *dmc, struct bio* bio,
    1182               int index)
    1183{
    1184     struct kcached_job *job;
    1185     struct cacheblock *cacheblk = &dmc->cache[index];
    1186
    1187     job = new_kcached_job(dmc, bio, index);
    1188     if (unlikely(sysctl_flashcache_error_inject & READ_MISS_JOB_ALLOC_FAIL)) {
    1189          if (job)
    1190               flashcache_free_cache_job(job);
    1191          job = NULL;
    1192          sysctl_flashcache_error_inject &= ~READ_MISS_JOB_ALLOC_FAIL;
    1193     }
    1194     if (unlikely(job == NULL)) {
    1195          /* 
    1196          * We have a read miss, and can't allocate a job.
    1197          * Since we dropped the spinlock, we have to drain any 
    1198          * pending jobs.
    1199          */
    1200          DMERR("flashcache: Read (miss) failed ! Can't allocate memory for cache IO, block %lu", 
    1201                cacheblk->dbn);
    1202          flashcache_bio_endio(bio, -EIO);
    1203          spin_lock_irq(&dmc->cache_spin_lock);
    1204          dmc->cached_blocks--;
    1205          cacheblk->cache_state &= ~VALID;
    1206          cacheblk->cache_state |= INVALID;
    1207          flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1208          cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1209          spin_unlock_irq(&dmc->cache_spin_lock);
    1210     } else {
    1211          job->action = READDISK; /* Fetch data from the source device */
    1212          atomic_inc(&dmc->nr_jobs);
    1213          dmc->disk_reads++;
    1214          dm_io_async_bvec(1, &job->disk, READ,
    1215                    bio->bi_io_vec + bio->bi_idx,
    1216                    flashcache_io_callback, job);
    1217          flashcache_clean_set(dmc, index / dmc->assoc);
    1218     }
    1219}

    在第1187行申请了一个kcached_job,申请成功就到1211行,设置job->action=READDISK,调用dm_io_async_bvec直接从磁盘读取数据。接着调用flashcache_clean_set检查一下水位线。再看这里读磁盘的回调函数flashcache_io_callback,按理说读完磁盘就可以直接向上层返回数据,但这里还要把数据缓存起来之后再返回。
    113void 
    114flashcache_io_callback(unsigned long error, void *context)
    115{
    116     struct kcached_job *job = (struct kcached_job *) context;
    117     struct cache_c *dmc = job->dmc;
    118     struct bio *bio;
    119     unsigned long flags;
    120     int index = job->index;
    121     struct cacheblock *cacheblk = &dmc->cache[index];
    122
    123     VERIFY(index != -1);          
    124     bio = job->bio;
    125     VERIFY(bio != NULL);
    126     if (error)
    127          DMERR("flashcache_io_callback: io error %ld block %lu action %d", 
    128                error, job->disk.sector, job->action);
    129     job->error = error;
    130     switch (job->action) {
    131     case READDISK:
    132          DPRINTK("flashcache_io_callback: READDISK  %d",
    133               index);
    134          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    135          if (unlikely(sysctl_flashcache_error_inject & READDISK_ERROR)) {
    136               job->error = error = -EIO;
    137               sysctl_flashcache_error_inject &= ~READDISK_ERROR;
    138          }
    139          VERIFY(cacheblk->cache_state & DISKREADINPROG);
    140          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    141          if (likely(error == 0)) {
    142               /* Kick off the write to the cache */
    143               job->action = READFILL;
    144               flashcache_enqueue_readfill(dmc, job);
    145               return;
    146          } else {
    147               dmc->disk_read_errors++;               
    148               flashcache_bio_endio(bio, error);
    149          }
    150          break;
              
    174     case READFILL:
    175          DPRINTK("flashcache_io_callback: READFILL %d",
    176               index);
    177          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    178          if (unlikely(sysctl_flashcache_error_inject & READFILL_ERROR)) {
    179               job->error = error = -EIO;
    180               sysctl_flashcache_error_inject &= ~READFILL_ERROR;
    181          }
    182          if (unlikely(error))
    183               dmc->ssd_write_errors++;
    184          VERIFY(cacheblk->cache_state & DISKREADINPROG);
    185          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    186          flashcache_bio_endio(bio, error);
    187          break;

    归纳一下读不命中的流程:
    1)创建一个kcached_job,直接下发到磁盘
    2)读磁盘返回到flashcache_io_callback,到131行下发READFILL,将读出来的数据写到缓存中
    3)写缓存成功并返回到flashcache_io_callback,到174行将数据返回给上层
    到这里已经将读流程简单过了一遍,下一个小节介绍写流程。
  • 相关阅读:
    svn command line tag
    MDbg.exe(.NET Framework 命令行调试程序)
    Microsoft Web Deployment Tool
    sql server CI
    VS 2010 One Click Deployment Issue “Application Validation did not succeed. Unable to continue”
    mshtml
    大厂程序员站错队被架空,只拿着五折工资!苟活和离职,如何选择?
    揭秘!Windows 为什么会蓝屏?微软程序员竟说是这个原因...
    喂!千万别忘了这个C语言知识!(~0 == -1 问题)
    Linux 比 Windows 更好,谁反对?我有13个赞成理由
  • 原文地址:https://www.cnblogs.com/keanuyaoo/p/3327665.html
Copyright © 2011-2022 走看看