zoukankan      html  css  js  c++  java
  • python多进程

    以抓取猫眼的Top100热门电影的信息为例:

    # -*- coding: utf-8 -*-
    import urllib
    import urllib2
    import re
    import json
    import lxml.html
    import time
    import datetime
    from bs4 import BeautifulSoup
    import multiprocessing
    from multiprocessing import Pool
    import sys
    # Python 2 only: reload(sys) makes setdefaultencoding available again so
    # that implicit str/unicode conversions use utf-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Raw string: in a plain literal the '\r' of '\result' is a carriage-return
    # escape, so the file would not be opened at E:\result.txt.
    fd = open(r'E:\result.txt', 'w')
    URL = 'http://maoyan.com/board/4'
    
    def download(url, user_agent='wswp', num_try=2):
        """Fetch ``url`` and return the response body, or None on failure.

        url        -- address to request.
        user_agent -- value sent in the User-agent request header.
        num_try    -- number of extra attempts allowed after a 5xx HTTP error.

        Only server-side (5xx) errors are retried.  Any other failure (a 403,
        another HTTP status, or an error that carries no HTTP status at all)
        gives up immediately and returns None.
        """
        # HTTP header names use hyphens; with 'User_agent' the real User-agent
        # header is never overridden.
        headers = {'User-agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        while True:
            try:
                return urllib2.urlopen(request).read()
            except urllib2.URLError as e:
                print('Download error %s' % (e.reason,))
                # A plain URLError (e.g. connection refused) has no ``code``
                # attribute, so it must not be read unconditionally.
                code = getattr(e, 'code', None)
                if num_try > 0 and code is not None and 500 <= code < 600:
                    num_try -= 1
                    continue
                return None
    
    
    def get_message(url):
        """Download one board page and write its movies to the result file.

        url -- address of the board page to scrape.

        Rank, title, cast, release time and score are extracted from the raw
        HTML with regular expressions and written to the module-level file
        object ``fd``, followed by a newline per movie.  Each title is also
        printed as it is processed.  Nothing is written when the page could
        not be downloaded.
        """
        html = download(url)
        if html is None:
            # download() returns None on failure; there is nothing to parse.
            return

        flags = re.S | re.M
        rank = re.findall(
            r'<i class="board-index board-index-.*?">(.*?)</i>', html)
        title = re.findall(r'<p class="name"><.*?>(.*?)</a>', html, flags)
        major = re.findall(r'<p class="star">(.*?)</p>', html, flags)
        data = re.findall(r'<p class="releasetime">(.*?)</p>', html, flags)
        inte = re.findall(r'<i class="integer">(.*?)</i>', html, flags)
        pe = re.findall(r'<i class="fraction">(.*?)</i>', html, flags)

        # Iterate over every movie that was matched rather than a fixed
        # range(0, 9): that range stops after nine entries and raises
        # IndexError when a page yields fewer matches.
        for m_rank, m_title, m_major, m_data, m_inte, m_pe in zip(
                rank, title, major, data, inte, pe):
            print(m_title)
            fd.write('Rank:' + m_rank)
            fd.write('电影:' + m_title)
            fd.write('评分 ' + m_inte + m_pe)
            # Drop the spaces inside the cast block.
            fd.write(m_major.replace(' ', ''))
            fd.write(m_data)
            fd.write('\n')
    
    
    def main(offset):
        """Scrape the board page that starts at ``offset``.

        Builds the page address, echoes it, and hands it to get_message().
        """
        page_url = 'http://maoyan.com/board/4?offset={}'.format(offset)
        print(page_url)
        get_message(page_url)
    
    if __name__ == '__main__':
        # Scrape the ten board pages one after another and report how long
        # the whole run took.
        t = time.time()
        try:
            for i in range(10):
                main(i * 10)
            t1 = time.time()
            print('Total time:')
            print(t1 - t)
        finally:
            # Close (and thereby flush) the result file even if a page fails.
            fd.close()

    单进程的代码所花费的时间是:

    利用多进程的Pool的时间是:

    pool更改的代码是:

    # Distribute the ten page offsets over a pool of worker processes.
    # NOTE(review): the workers write through the module-level ``fd``; the
    # order of pages in the output file is presumably not guaranteed - confirm
    # that this is acceptable.
    pool = Pool()
    try:
        pool.map(main, [i * 10 for i in range(10)])
    finally:
        # Release the worker processes once all pages are done (or on error).
        pool.close()
        pool.join()
  • 相关阅读:
    Mongo库表占用空间统计
    修改elasticsearch默认索引返回数
    针对docker中的mongo容器增加鉴权
    自动化测试框架STAF介绍
    单点登陆思想
    Django请求流程
    python冒泡排序,可对list中的字典进行排序
    python合并list中各个字典中相同条件的累计数
    哎,linux nginx命令就是记不住啊
    python利用urllib2读取webservice接口数据
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7308348.html
Copyright © 2011-2022 走看看