一旦有一个抓取请求开始,就创建一个新的线程,由该线程执行任务,任务执行完毕之后,线程就退出。这就是"即时创建,即时销毁"的策略。尽管与创建进程相比,创建线程的时间已经大大的缩短,但是如果提交给线程的任务是执行时间较短,而且执行次数非常频繁,那么服务器就将处于一个不停的创建线程和销毁线程的状态。这笔开销是不可忽略的,尤其是线程执行的时间非常非常短的情况。
线程池就是为了解决上述问题的,它的实现原理是这样的:在应用程序启动之后,就马上创建一定数量的线程,放入空闲的队列中。这些线程都是处于阻塞状态,这些线程只占一点内存,不占用CPU。当任务到来后,线程池将选择一个空闲的线程,将任务传入此线程中运行。当所有的线程都处在处理任务的时候,线程池将自动创建一定的数量的新线程,用于处理更多的任务。执行任务完成之后线程并不退出,而是继续在线程池中等待下一次任务。当大部分线程处于阻塞状态时,线程池将自动销毁一部分的线程,回收系统资源。
下面是通过一个线程池的实现,来解决多任务抓取问题。
处理流程如下:
程序启动之前,初始化线程池,启动线程池中的线程,由于还没有任务到来,线程池中的所有线程都处在阻塞状态,当一有任务到达就从线程池中取出一个空闲线程处理,如果所有的线程都处于工作状态,就添加到队列,进行排队。如果队列中的任务个数大于队列的所能容纳的最大数量,那就不能添加任务到队列中,只能等待队列不满才能添加任务到队列中。
struct job { void* (*callback_function)(void *arg); //线程回调函数 void *arg; //回调函数参数 struct job *next; }; struct threadpool { int thread_num; //线程池中开启线程的个数 int queue_max_num; //队列中最大job的个数 struct job *head; //指向job的头指针 struct job *tail; //指向job的尾指针 pthread_t *pthreads; //线程池中所有线程的pthread_t pthread_mutex_t mutex; //互斥信号量 pthread_cond_t queue_empty; //队列为空的条件变量 pthread_cond_t queue_not_empty; //队列不为空的条件变量 pthread_cond_t queue_not_full; //队列不为满的条件变量 int queue_cur_num; //队列当前的job个数 int queue_close; //队列是否已经关闭 int pool_close; //线程池是否已经关闭 }; struct threadpool* threadpool_init(intthread_num, int queue_max_num) { struct threadpool *pool = NULL; do { pool = malloc(sizeof(struct threadpool)); if (NULL == pool) { printf("failed to malloc threadpool! "); break; } pool->thread_num = thread_num; pool->queue_max_num = queue_max_num; pool->queue_cur_num = 0; pool->head = NULL; pool->tail = NULL; if (pthread_mutex_init(&(pool->mutex), NULL)) { printf("failed to init mutex! "); break; } if (pthread_cond_init(&(pool->queue_empty), NULL)) { printf("failed to init queue_empty! "); break; } if (pthread_cond_init(&(pool->queue_not_empty), NULL)) { printf("failed to init queue_not_empty! "); break; } if (pthread_cond_init(&(pool->queue_not_full), NULL)) { printf("failed to init queue_not_full! "); break; } pool->pthreads = malloc(sizeof(pthread_t) * thread_num); if (NULL == pool->pthreads) { printf("failed to malloc pthreads! "); break; } pool->queue_close = 0; pool->pool_close = 0; int i; for (i = 0; i < pool->thread_num; ++i) { pthread_create(&(pool->pthreads[i]), NULL, threadpool_function,(void *)pool); } return pool; }while (0); return NULL; } int threadpool_add_job(struct threadpool*pool, void* (*callback_function)(void *arg), void *arg) { assert(pool != NULL); assert(callback_function != NULL); assert(arg != NULL); pthread_mutex_lock(&(pool->mutex)); while ((pool->queue_cur_num == pool->queue_max_num) &&!(pool->queue_close || pool->pool_close)) { pthread_cond_wait(&(pool->queue_not_full),&(pool->mutex)); //队列满的时候就等待 } if (pool->queue_close || pool->pool_close) //队列关闭或者线程池关闭就退出 { pthread_mutex_unlock(&(pool->mutex)); return -1; } struct job *pjob =(struct job*) malloc(sizeof(struct job)); if (NULL == pjob) { pthread_mutex_unlock(&(pool->mutex)); return -1; } pjob->callback_function = callback_function; pjob->arg = arg; pjob->next = NULL; if (pool->head == NULL) { pool->head = pool->tail = pjob; pthread_cond_broadcast(&(pool->queue_not_empty)); //队列空的时候,有任务来时就通知线程池中的线程:队列非空 } else { pool->tail->next = pjob; pool->tail = pjob; } pool->queue_cur_num++; pthread_mutex_unlock(&(pool->mutex)); return 0; } void* threadpool_function(void* arg) { struct threadpool *pool = (struct threadpool*)arg; struct job *pjob = NULL; while (1) //死循环 { pthread_mutex_lock(&(pool->mutex)); while ((pool->queue_cur_num == 0) &&!pool->pool_close) //队列为空时,就等待队列非空 { pthread_cond_wait(&(pool->queue_not_empty),&(pool->mutex)); } if (pool->pool_close) //线程池关闭,线程就退出 { pthread_mutex_unlock(&(pool->mutex)); pthread_exit(NULL); } pool->queue_cur_num--; pjob = pool->head; if (pool->queue_cur_num == 0) { pool->head = pool->tail = NULL; } else { pool->head = pjob->next; } if (pool->queue_cur_num == 0) { pthread_cond_signal(&(pool->queue_empty)); //队列为空,就可以通知threadpool_destroy函数,销毁线程函数 } if (pool->queue_cur_num == pool->queue_max_num - 1) { pthread_cond_broadcast(&(pool->queue_not_full)); //队列非满,就可以通知threadpool_add_job函数,添加新任务 } pthread_mutex_unlock(&(pool->mutex)); (*(pjob->callback_function))(pjob->arg); //线程真正要做的工作,回调函数的调用 free(pjob); pjob = NULL; } } int threadpool_destroy(struct threadpool*pool) { assert(pool != NULL); pthread_mutex_lock(&(pool->mutex)); if (pool->queue_close || pool->pool_close) //线程池已经退出了,就直接返回 { pthread_mutex_unlock(&(pool->mutex)); return -1; } pool->queue_close = 1; //置队列关闭标志 while (pool->queue_cur_num != 0) { pthread_cond_wait(&(pool->queue_empty), &(pool->mutex)); //等待队列为空 } pool->pool_close = 1; //置线程池关闭标志 pthread_mutex_unlock(&(pool->mutex)); pthread_cond_broadcast(&(pool->queue_not_empty)); //唤醒线程池中正在阻塞的线程 pthread_cond_broadcast(&(pool->queue_not_full)); //唤醒添加任务的threadpool_add_job函数 int i; for (i = 0; i < pool->thread_num; ++i) { pthread_join(pool->pthreads[i], NULL); //等待线程池的所有线程执行完毕 } pthread_mutex_destroy(&(pool->mutex)); //清理资源 pthread_cond_destroy(&(pool->queue_empty)); pthread_cond_destroy(&(pool->queue_not_empty)); pthread_cond_destroy(&(pool->queue_not_full)); free(pool->pthreads); struct job *p; while (pool->head != NULL) { p = pool->head; pool->head = p->next; free(p); } free(pool); return 0; }