  • Memcached source code analysis: slabs.c

#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

typedef struct {
    unsigned int size;      /* sizes of items: the size of one item (chunk) in this class */
    unsigned int perslab;   /* how many items per slab (a slab is also called a "page") */

    /**
     * The free-item list of this slabclass -- the list of every usable item. All the
     * memory this slabclass can still hand out lives here; it is the entry point for
     * allocation, and allocating means popping one item off this list.
     *
     * Note: only newer memcached versions use slots for "all free items". In older
     * versions slots held only recycled items, and a freshly allocated slab was
     * tracked with the end_page_ptr / end_page_free fields, which this version no
     * longer has.
     */
    void *slots;            /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list, i.e. the current length of slots */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    /**
     * slab_list is the list of slabs owned by this slabclass: logically an array
     * whose elements are slab pointers. list_size is that array's capacity.
     * Note how list_size differs from slabs above: slab_list is a plain array of
     * fixed capacity, list_size says how many elements it has room for, while
     * slabs counts the slabs actually allocated so far. So when slabs == list_size
     * the slab_list is full and must be grown.
     */
    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;   /* index+1 of dying slab, or zero if none */
    size_t requested;       /* The number of requested bytes */
} slabclass_t;
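
To make size and perslab concrete before reading on, here is a minimal standalone sketch (not part of slabs.c; the 1 MB page and the 80-byte chunk size are assumed values) that computes perslab the same way slabs_init does:

#include <stdio.h>

int main(void) {
    unsigned int item_size_max = 1024 * 1024; /* assumed: the default 1 MB slab page */
    unsigned int size = 80;                   /* hypothetical chunk size of one class */
    unsigned int perslab = item_size_max / size;
    /* a 1 MB page carved into 80-byte chunks holds 13107 items;
       the remainder of the page (16 bytes here) is simply wasted */
    printf("perslab = %u, waste = %u bytes\n", perslab, item_size_max - perslab * size);
    return 0;
}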

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;    /* upper bound on total memory */
static size_t mem_malloced = 0; /* memory allocated so far */
static int power_largest;

static void *mem_base = NULL;   /* the preallocated memory block, if any */
static void *mem_current = NULL;
static size_t mem_avail = 0;

static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);

static void slabs_preallocate (const unsigned int maxslabs);

/* Find the slabclass whose chunk size fits an item of the given size. */
unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}
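
A quick sketch of how this lookup behaves. The size table below is made up for illustration (the real table depends on the growth factor); the point is that the linear scan returns the first class whose chunk size is at least the requested size, and 0 when nothing fits:

#include <stdio.h>

/* Standalone model of slabs_clsid: a linear scan over ascending chunk sizes. */
static const unsigned int class_size[] = {0, 96, 120, 152, 192}; /* index 0 unused */
#define SMALLEST 1
#define LARGEST  4

static unsigned int clsid_for(size_t size) {
    unsigned int res = SMALLEST;
    if (size == 0) return 0;
    while (size > class_size[res])
        if (res++ == LARGEST) return 0;  /* won't fit in the biggest class */
    return res;
}

int main(void) {
    printf("%u\n", clsid_for(100)); /* prints 2: the 120-byte class is the first that fits */
    printf("%u\n", clsid_for(500)); /* prints 0: larger than the biggest class */
    return 0;
}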

/**
 * Initialize the slabs subsystem; sets up the memory-management state.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit; /* limit is the xx from the -m xx startup option: the memory ceiling */

    if (prealloc) {
        /**
         * If the user enabled preallocation, grab the whole memory limit up front and
         * stash it in the global mem_base. The server then owns one big block, and
         * every later allocation is carved out of it.
         */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    /* Initialize each slabclass entry. */
    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite: fake how much has already been malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
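
To see what the sizing loop actually produces, here is a small self-contained sketch. The starting size (96), factor (1.25), 1 MB page and 8-byte alignment are assumptions standing in for sizeof(item) + settings.chunk_size, the -f factor, settings.item_size_max and CHUNK_ALIGN_BYTES; with those defaults it prints the familiar 96/120/152/192... progression:

#include <stdio.h>

int main(void) {
    unsigned int size = 48 + 48;        /* stand-in for sizeof(item) + settings.chunk_size */
    const unsigned int item_size_max = 1024 * 1024;
    const unsigned int align = 8;       /* stand-in for CHUNK_ALIGN_BYTES */
    const double factor = 1.25;
    int i = 0;

    while (++i < 10 && size <= item_size_max / factor) {
        if (size % align)
            size += align - (size % align); /* round up to the alignment boundary */
        printf("class %2d: chunk %6u perslab %6u\n", i, size, item_size_max / size);
        size *= factor;
    }
    return 0;
}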

/**
 * Memory preallocation: when the user enabled it, this is called to carve one slab
 * for every slabclass out of mem_base up front.
 */
static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
        if (++prealloc > maxslabs)
            return;
        if (do_slabs_newslab(i) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                    "If using -L or other prealloc options, max memory must be "
                    "at least %d megabytes.\n", power_largest);
            exit(1);
        }
    }

}

static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    /**
     * p->slab_list is a fixed-capacity array, and p->list_size is its capacity.
     * p->slabs is the number of slabs already allocated, so when
     * slabs == list_size the array is full and must be grown.
     */
    if (p->slabs == p->list_size) {
        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}

/**
 * Break a whole slab into individual items (a.k.a. chunks) and put them on the
 * corresponding slots list.
 */
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id); /* make this item usable, i.e. push it onto the slots list */
        ptr += p->size;
    }
}

/**
 * Allocate a new slab for slabclass[id]. Called only when the class has no free
 * space left on its slots list.
 */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab; /* with slab reassign on, every page uses the fixed
                                   maximum size (~1 MB by default) so pages stay
                                   interchangeable; otherwise size * perslab */
    char *ptr;

    /**
     * The if below reads:
     * if the new slab would push us over the memory limit, fail and return 0;
     * otherwise grow_slab_list checks whether slab_list must be enlarged, and if
     * that fails, return 0;
     * otherwise memory_allocate grabs the space, and if that fails, also return 0.
     */
    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len); /* wipe the new memory */
    split_slab_page_into_freelist(ptr, id); /* push the new slab's chunks onto slots */

    p->slab_list[p->slabs++] = ptr; /* record the new slab in slab_list */
    mem_malloced += len; /* account for the newly allocated bytes */
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

/**
 * Allocate space from the slabclass matching the item size.
 */
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) { /* valid ids run from 1 up to at most 200 by default */
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id]; /* slabclass is the global array of slabclass objects; fetch the one for this id */
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    /**
     * The if below reads:
     * if p->sl_curr == 0 (no free space on the slots list), do_slabs_newslab tries
     * to allocate a new slab; if that also fails we enter the if and ret stays
     * NULL. Otherwise we fall through to the else-if branch.
     */
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) { /* the slots list still has free space */
        /* return off our freelist */
        /* hand out a free item: pop the head of the slots list */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size; /* success: account for the requested bytes */
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

/**
 * Despite its name, do_slabs_free does not release memory; it makes the space
 * available again, which simply means pushing it onto this slabclass's slots list.
 * That is also why a freshly allocated slab goes through this function, to make the
 * whole slab usable.
 *
 * Note: older memcached versions kept only recycled items on slots; now slots holds
 * every available item.
 */
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

    it = (item *)ptr;
    it->it_flags |= ITEM_SLABBED; /* mark the item as slabbed */
    it->prev = 0;
    it->next = p->slots;  /* push onto the head of the slots list */
    if (it->next) it->next->prev = it;
    p->slots = it;

    p->sl_curr++; /* one more free item */
    p->requested -= size;
    return;
}
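
The slots list that do_slabs_alloc pops from and do_slabs_free pushes onto is an intrusive doubly linked list threaded through the free chunks themselves. A minimal standalone model of the two operations, with a bare node type standing in for memcached's item struct:

#include <assert.h>
#include <stddef.h>

/* Simplified stand-in for memcached's item: the links live inside the free chunk. */
typedef struct node {
    struct node *next;
    struct node *prev;
} node;

static node *slots = NULL;      /* head of the freelist */
static unsigned int sl_curr = 0;

static void freelist_push(node *it) {   /* what do_slabs_free does */
    it->prev = NULL;
    it->next = slots;
    if (it->next) it->next->prev = it;
    slots = it;
    sl_curr++;
}

static node *freelist_pop(void) {       /* what do_slabs_alloc does */
    if (sl_curr == 0) return NULL;      /* the real code would try do_slabs_newslab here */
    node *it = slots;
    slots = it->next;
    if (it->next) it->next->prev = NULL;
    sl_curr--;
    return it;
}

int main(void) {
    node a, b;
    freelist_push(&a);
    freelist_push(&b);
    assert(freelist_pop() == &b);  /* LIFO: the last freed chunk is handed out first */
    assert(freelist_pop() == &a);
    return 0;
}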

static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength=strlen(z);
    return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
}

bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
            APPEND_STAT("curr_items", "%u", stats.curr_items);
            APPEND_STAT("total_items", "%u", stats.total_items);
            STATS_UNLOCK();
            item_stats_totals(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}

static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

/**
 * Allocate a block of memory.
 */
static void *memory_allocate(size_t size) {
    void *ret;

    /**
     * Two allocation strategies:
     * 1) with preallocation enabled, carve a piece off the big preallocated block,
     *    i.e. take the else branch below;
     * 2) without preallocation, just malloc the block.
     *
     * See slabs_init for how preallocation is set up.
     */
    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }

    return ret;
}
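
The rounding in the else branch is the usual round-up-to-a-multiple trick. A tiny sketch of what it does to a few sizes, assuming the alignment is 8 (the value of CHUNK_ALIGN_BYTES in memcached.h):

#include <stdio.h>

int main(void) {
    const size_t align = 8; /* stand-in for CHUNK_ALIGN_BYTES */
    size_t sizes[] = {1, 8, 13, 100};
    for (int i = 0; i < 4; i++) {
        size_t size = sizes[i];
        if (size % align)
            size += align - (size % align);
        printf("%zu -> %zu\n", sizes[i], size); /* 1->8, 8->8, 13->16, 100->104 */
    }
    return 0;
}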

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}
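
These locked wrappers are the allocator's public entry points. A hedged sketch of a typical round-trip through them (a hypothetical call site; the real callers, e.g. item allocation in items.c, look different):

/* Hypothetical call site inside memcached (not a standalone program);
 * ntotal would be the full item footprint in bytes. */
static void example_roundtrip(size_t ntotal) {
    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)                        /* 0 means: too large for any class */
        return;
    void *chunk = slabs_alloc(ntotal, id);
    if (chunk != NULL) {
        /* ... initialize and use the chunk as an item ... */
        slabs_free(chunk, ntotal, id);  /* back onto the class's slots list */
    }
}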

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
{
    pthread_mutex_lock(&slabs_lock);
    slabclass_t *p;
    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    p = &slabclass[id];
    p->requested = p->requested - old + ntotal;
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;
static volatile int do_run_slab_rebalance_thread = 1;

#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest  ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest  ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    s_cls = &slabclass[slab_rebal.s_clsid];

    if (!grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos   = slab_rebal.slab_start;
    slab_rebal.done       = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}

enum move_status {
    MOVE_PASS=0, MOVE_DONE, MOVE_BUSY, MOVE_LOCKED
};

static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            void *hold_lock = NULL;
            uint32_t hv = hash(ITEM_key(it), it->nkey);
            if ((hold_lock = item_trylock(hv)) == NULL) {
                status = MOVE_LOCKED;
            } else {
                refcount = refcount_incr(&it->refcount);
                if (refcount == 1) { /* item is unlinked, unused */
                    if (it->it_flags & ITEM_SLABBED) {
                        /* remove from slab freelist */
                        if (s_cls->slots == it) {
                            s_cls->slots = it->next;
                        }
                        if (it->next) it->next->prev = it->prev;
                        if (it->prev) it->prev->next = it->next;
                        s_cls->sl_curr--;
                        status = MOVE_DONE;
                    } else {
                        status = MOVE_BUSY;
                    }
                } else if (refcount == 2) { /* item is linked but not busy */
                    if ((it->it_flags & ITEM_LINKED) != 0) {
                        do_item_unlink_nolock(it, hv);
                        status = MOVE_DONE;
                    } else {
                        /* refcount == 1 + !ITEM_LINKED means the item is being
                         * uploaded to, or was just unlinked but hasn't been freed
                         * yet. Let it bleed off on its own and try again later */
                        status = MOVE_BUSY;
                    }
                } else {
                    if (settings.verbose > 2) {
                        fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                    }
                    status = MOVE_BUSY;
                }
                item_trylock_unlock(hold_lock);
            }
        }

        switch (status) {
        case MOVE_DONE:
            it->refcount = 0;
            it->it_flags = 0;
            it->slabs_clsid = 255;
            break;
        case MOVE_BUSY:
            refcount_decr(&it->refcount);
            /* fall through: a busy item is counted like a locked one */
        case MOVE_LOCKED:
            slab_rebal.busy_items++;
            was_busy++;
            break;
        case MOVE_PASS:
            break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}

static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    s_cls->slab_list[s_cls->killing - 1] =
        s_cls->slab_list[s_cls->slabs - 1];
    s_cls->slabs--;
    s_cls->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    split_slab_page_into_freelist(slab_rebal.slab_start,
                                  slab_rebal.d_clsid);

    slab_rebal.done       = 0;
    slab_rebal.s_clsid    = 0;
    slab_rebal.d_clsid    = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end   = NULL;
    slab_rebal.slab_pos   = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/*
 * When slab automove is enabled, this function decides on the reassignment plan.
 */
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins   = 0;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_diff = 0;
    uint64_t evicted_max  = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int i;
    int source = 0;
    int dest = 0;
    static rel_time_t next_run;

    /* Run less frequently than the slabmove tester. */
    if (current_time >= next_run) {
        next_run = current_time + 10;
    } else {
        return 0;
    }

    item_stats_evictions(evicted_new);
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        evicted_diff = evicted_new[i] - evicted_old[i];
        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (source && dest) {
        *src = source;
        *dst = dest;
        return 1;
    }
    return 0;
}
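
In short: a class becomes the move source after three consecutive 10-second windows with zero evictions (provided it has more than two pages to spare), and a class becomes the destination after leading the eviction counts three windows in a row. A standalone toy model of that decision, fed with made-up eviction deltas for two hypothetical classes:

#include <stdio.h>

int main(void) {
    /* window-by-window eviction deltas: class 1 never evicts (donor candidate),
       class 2 leads every window (recipient candidate) */
    unsigned long diffs[2][3] = { {0, 0, 0},
                                  {50, 80, 60} };
    unsigned int zeroes[2] = {0, 0}, wins = 0, winner = 0;
    for (int w = 0; w < 3; w++) {
        unsigned int leader = diffs[0][w] > diffs[1][w] ? 1 : 2;
        for (int c = 0; c < 2; c++)
            zeroes[c] = (diffs[c][w] == 0) ? zeroes[c] + 1 : 0;
        if (winner == leader) wins++; else { winner = leader; wins = 1; }
        if (zeroes[0] >= 3 && wins >= 3)
            printf("move a page: class 1 -> class %u\n", winner);
    }
    return 0;
}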

/* Slab rebalancer thread.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended.
 * This is the entry point of the slab maintenance thread.
 */
static void *slab_maintenance_thread(void *arg) {
    int src, dest;

    while (do_run_slab_thread) {
        if (settings.slab_automove == 1) {
            if (slab_automove_decision(&src, &dest) == 1) {
                /* Blind to the return codes. It will retry on its own */
                slabs_reassign(src, dest); /* move a slab from src to dest */
            }
            sleep(1);
        } else {
            /* Don't wake as often if we're not enabled.
             * This is lazier than setting up a condition right now. */
            sleep(5);
        }
    }
    return NULL;
}

/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }
    return NULL;
}

static int slabs_reassign_pick_any(int dst) {
    static int cur = POWER_SMALLEST - 1;
    int tries = power_largest - POWER_SMALLEST + 1;
    for (; tries > 0; tries--) {
        cur++;
        if (cur > power_largest)
            cur = POWER_SMALLEST;
        if (cur == dst)
            continue;
        if (slabclass[cur].slabs > 1) {
            return cur;
        }
    }
    return -1;
}

static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    /* Special indicator to choose ourselves. */
    if (src == -1) {
        src = slabs_reassign_pick_any(dst);
        /* TODO: If we end up back at -1, return a new error type */
    }

    if (src < POWER_SMALLEST || src > power_largest ||
        dst < POWER_SMALLEST || dst > power_largest)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;
    pthread_cond_signal(&slab_rebalance_cond);

    return REASSIGN_OK;
}

enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
        return REASSIGN_RUNNING;
    }
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_rebalance_lock);
    return ret;
}
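
A hedged usage sketch: src == -1 is the "pick a donor for me" indicator handled above, and class id 5 is an arbitrary choice for the destination (a hypothetical call site, not code from memcached):

/* Hypothetical caller: take a page from any donor class, give it to class 5. */
enum reassign_result_type rv = slabs_reassign(-1, 5);
if (rv == REASSIGN_RUNNING) {
    /* a move is already in flight (or the lock is contended); retry later */
}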

/* If we hold this lock, rebalancer can't wake up or move */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}

void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}

static pthread_t maintenance_tid;
static pthread_t rebalance_tid;

/**
 * Start the slab maintenance threads.
 */
int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }

    if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
        fprintf(stderr, "Can't initialize rebalance condition\n");
        return -1;
    }
    pthread_mutex_init(&slabs_rebalance_lock, NULL);

    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
        return -1;
    }
    if ((ret = pthread_create(&rebalance_tid, NULL,
                              slab_rebalance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}

/**
 * Stop the slab maintenance threads; same pattern as stopping the hash table
 * maintenance thread.
 */
void stop_slab_maintenance_thread(void) {
    mutex_lock(&cache_lock);
    do_run_slab_thread = 0;
    do_run_slab_rebalance_thread = 0;
    pthread_cond_signal(&maintenance_cond);
    pthread_mutex_unlock(&cache_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(maintenance_tid, NULL);
    pthread_join(rebalance_tid, NULL);
}
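
For completeness, a sketch of how the start/stop pair would be driven (hypothetical driver code; the real call sites live elsewhere in memcached and may differ):

/* Hypothetical driver, assuming the rest of memcached has already been set up. */
if (start_slab_maintenance_thread() != 0) {
    exit(EXIT_FAILURE);
}
/* ... serve traffic ... */
stop_slab_maintenance_thread();  /* signals both threads, then joins them */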
  • Original article: https://www.cnblogs.com/guolanzhu/p/5850272.html