crush_do_rule中,用了一个scratch空间来完成item的搜索。
scratch空间总共有3个max_result这么大,并且按照max_result长度划分为三个部分(下图中的a、b、c,其中c只在recursive_to_leaf时用到,本文不涉及)。
a、b两个部分就用来生成result。a、b两个部分分别由o、w两个数组指针来指向,在每完成一个select step后,o、w互换指向的位置,上一次的o将变成本次的w,成为本次step遍历的对象,而上次的w将变成本次的o用于存放本次step的output items。
以Sage Weil论文中的例子来演示此过程:
take(root) ->root
select(1, row) ->row2 //图例step1
select(3, cabinet) ->cab21 cab23 cab24 //图例step2
select(1, disk) ->disk2107 disk2313 disk2437 //图例step3
emit //图例step4 (从scratch数组中将选中的item拷贝到result数组中)
简化后的骨干代码如下:
1、省略了itemid的合法性校验代码
2、仅考虑firstn的情况(即仅考虑replica策略)
3、不考虑recursive_to_leaf的情况(此为工程优化,非Sage Weil论文的核心内容)
int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, int *scratch) { int osize, wsize = 0; int *w, *o; w = scratch; o = scratch + result_max; struct crush_rule *rule = map->rules[ruleno]; for (__u32 step = 0; step < rule->len; step++) { struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; wsize = 1; break; //Elar: //1. only consider fistn's situation //2. ignore recurse_to_leaf situation case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: /* reset output */ osize = 0; for (int i = 0; i < wsize; i++) { int numrep = curstep->arg1; int outpos = 0; int type = curstep->arg2; int bno = -1 - w[i];//Elar: get bucketId struct crush_bucket *bucket = map->buckets[bno]; osize += crush_choose_firstn(map, bucket, weight, weight_max, x, numrep, type, o+osize, outpos); } /* swap o and w arrays */ int *tmp = o; o = w; w = tmp; wsize = osize; break; case CRUSH_RULE_EMIT: int i = 0; result_len = 0; for (; i < wsize && result_len < result_max; i++) { result[result_len] = w[i]; result_len++; } wsize = 0; break; default: break; } } return result_len; } static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out) { for (int rep = 0; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ unsigned int ftotal = 0; bool skip_rep = 0; do { unsigned int flocal = 0; bool retry_descent = false; struct crush_bucket *in = bucket; do { bool retry_bucket = false; int r = rep + parent_r; /* r' = r + f_total */ r += ftotal; /* bucket choose */ int item = crush_bucket_choose(in, x, r); int itemtype; if (item < 0)//Elar: if item is a bucket, then get its type itemtype = map->buckets[-1-item]->type; else//Elar: if item is a device, then its type=0 itemtype = 0; //Elar: if this item's type is not what we expected, then keep going until we get an match one! if (itemtype != type) { in = map->buckets[-1-item]; retry_bucket = 1; continue; } // Elar: check if item has already been in the output array bool collide = false; for (i = 0; i < outpos; i++) { if (out[i] == item) { collide = true; break; } } bool reject = false; if (itemtype == 0){ //Elar: check if this item has been marked as "out" reject = is_out(map, weight,weight_max,item, x); }else{ reject = false; } reject: if (reject || collide) { ftotal++; flocal++; if still can try locally(with in the same bucket, try other items) retry_bucket = true; else if still can try descent(parent's or grandparent's sibling buckets) /* then retry descent */ retry_descent = true; else /* else give up */ skip_rep = true; } } while (true == retry_bucket); } while (true == retry_descent); if (true == skip_rep) { continue; } out[outpos] = item; outpos++; } return outpos; }
完整代码请移步git: