zoukankan      html  css  js  c++  java
  • 多进程的妙用

     1 #coding:utf-8
     2 import time
     3 import threading
     4 from html_downLoader import HtmlDownLoader
     5 import ParseAlexa
     6 import multiprocessing
     7 from MongoQueue import MongoQueue
     8 import sys
     9 if sys.getdefaultencoding()!="utf-8":
    10     reload(sys)
    11     sys.setdefaultencoding("utf-8")
    12 SLEEP_TIME=1
    13 alexaCallback=ParseAlexa.AlexaCallback()
    14 crawl_queue=alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
    15 max_threads=5
    16 result={}
    def threaded_crawler():
        """Drain the module-level ``crawl_queue`` with a pool of daemon threads.

        Up to ``max_threads`` worker threads are kept running while the queue
        reports pending work.  Every worker pops URLs, downloads them with a
        shared ``HtmlDownLoader`` and stores the HTML in the module-level
        ``result`` dict under the URL.  When the queue is empty and all workers
        have finished, ``result`` is printed followed by several blank lines.

        Returns:
            None.  The output is the ``result`` dict and the printed text.
        """
        threads = []
        downloader = HtmlDownLoader()

        def process_queue():
            """Worker loop: pop and download URLs until the queue raises."""
            while True:
                try:
                    url = crawl_queue.pop()
                    # NOTE(review): the URL is marked complete before it has
                    # been downloaded, so a failed download is presumably
                    # never retried -- confirm this is intended.
                    crawl_queue.complete(url)
                except Exception as e:
                    # Any queue error (presumably "queue empty") ends this
                    # worker after reporting the message.
                    print(e.message)
                    break
                else:
                    print("正在爬取%s" % url)
                    html = downloader.downLoad(url)
                    result[url] = html

        while threads or crawl_queue.__nonzero__():
            started_any = False
            while len(threads) < max_threads and crawl_queue.__nonzero__():
                thread = threading.Thread(target=process_queue)
                # Daemon threads do not keep the process alive on exit.
                thread.setDaemon(True)
                thread.start()
                threads.append(thread)
                # Stagger thread start-up, as the original code did.
                time.sleep(SLEEP_TIME)
                started_any = True

            # Build a new list instead of calling remove() while iterating,
            # which skips the element following every removed thread.
            threads = [t for t in threads if t.is_alive()]

            # Nothing to start and workers still busy: wait instead of
            # spinning at full CPU and competing with the workers.
            if threads and not started_any:
                time.sleep(SLEEP_TIME)

        # Dump this process's results; the trailing newlines separate the
        # dumps of different processes on a shared console.
        print("%s %s" % (result, '\n\n\n\n\n'))
    45 
    def process_crawler():
        """Run ``threaded_crawler`` in one daemon process per CPU core.

        Announces the number of processes, starts them one after another and
        then blocks until every one of them has exited.
        """
        worker_count = multiprocessing.cpu_count()
        print("Starting {} process".format(worker_count))

        workers = []
        for _ in range(worker_count):
            worker = multiprocessing.Process(target=threaded_crawler)
            worker.daemon = True
            worker.start()
            workers.append(worker)

        # Wait for all crawler processes before returning.
        for worker in workers:
            worker.join()
    # Script entry point: only launch the crawler processes when this file is
    # executed directly, not when it is imported.
    if __name__ == "__main__":
        process_crawler()
  • 相关阅读:
    2.六角星绘制
    1.五角星绘制
    Redis
    javaScript
    反射
    区分'方法'和'函数'
    递归,二分法
    匿名函数,排序函数,过滤函数,映射函数,
    生成器,生成器函数,推导式,生成器表达式.
    函数,闭包,迭代器
  • 原文地址:https://www.cnblogs.com/zhongshuiping/p/9815102.html
Copyright © 2011-2022 走看看