zoukankan      html  css  js  c++  java
  • Python 多进程爬虫实例

    Python 多进程爬虫实例

    import json
    import re
    import time
    from multiprocessing import Pool
    import requests
    from requests.exceptions import RequestException
    from bs4 import BeautifulSoup
    
    
    def get_one_page(url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely, which would
                stall the calling worker process.

        Returns:
            The response text when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status, or a ``RequestException`` such as a
            connection error or timeout).
        """
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    
    def parse_one_page(html):
        """Extract the board entries from one page of HTML.

        The page is parsed with the ``lxml`` parser and six parallel lists are
        collected: ``i.board-index``, the ``data-src`` attribute of every
        ``img.board-img``, ``p.name``, ``p.star``, ``p.releasetime`` and
        ``p.score``. The lists are paired up by position.

        Args:
            html: Markup of a single board page.

        Returns:
            A list of dicts with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score``. If the lists differ in length,
            pairing stops at the shortest one.

        Raises:
            KeyError: If a ``board-img`` image has no ``data-src`` attribute.
        """
        soup = BeautifulSoup(html, "lxml")
        index_tags = soup.select('i.board-index')
        # find_all is the supported spelling; findAll is a deprecated alias.
        image_urls = [
            img['data-src']
            for img in soup.find_all('img', {'class': 'board-img'})
        ]
        name_tags = soup.select('p.name')
        actor_tags = soup.select('p.star')
        time_tags = soup.select('p.releasetime')
        score_tags = soup.select('p.score')

        rows = zip(index_tags, image_urls, name_tags,
                   actor_tags, time_tags, score_tags)
        return [
            {
                'index': index_tag.get_text(),
                'image': image_url,
                'title': name_tag.get_text(),
                # Only the actor text is stripped of surrounding whitespace.
                'actor': actor_tag.get_text().strip(),
                'time': time_tag.get_text(),
                'score': score_tag.get_text(),
            }
            for index_tag, image_url, name_tag, actor_tag, time_tag, score_tag
            in rows
        ]
    
    
    def write_to_file(content):
        """Append ``content`` to ``resul1t.txt`` as one JSON document per line.

        The file is opened in append mode with UTF-8 encoding, relative to the
        current working directory. Non-ASCII characters are written as-is
        (``ensure_ascii=False``) rather than as ``\\uXXXX`` escapes.

        Args:
            content: Any object ``json.dumps`` can serialise.
        """
        with open('resul1t.txt', 'a', encoding='utf-8') as f:
            # The with-block closes the file on exit; no explicit close() needed.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    
    def main(offset_list):
        """Download, parse and store the board page for each offset.

        For every value in ``offset_list`` the page at
        ``http://maoyan.com/board/4?offset=<value>`` is fetched with
        ``get_one_page``. Pages that yield a falsy result are skipped; for the
        rest, each item returned by ``parse_one_page`` is passed to
        ``write_to_file``.

        Args:
            offset_list: Iterable of offsets to append to the board URL.
        """
        base_url = 'http://maoyan.com/board/4?offset='
        for page_offset in offset_list:
            page_html = get_one_page(base_url + str(page_offset))
            if not page_html:
                # Nothing was downloaded for this offset; move on.
                continue
            for record in parse_one_page(page_html):
                write_to_file(record)
    
    
    if __name__ == '__main__':
        # Multiprocessing: split the ten offsets (0, 10, ..., 90) into chunks
        # of two and hand each chunk to a worker process.
        offset_list = list(range(0, 100, 10))
        chunk_size = 2
        chunks = [offset_list[start:start + chunk_size]
                  for start in range(0, len(offset_list), chunk_size)]

        p = Pool()
        pending = [p.apply_async(main, args=(chunk,)) for chunk in chunks]
        p.close()
        p.join()

        # apply_async keeps any exception raised in a worker inside its result
        # object. All workers have finished by now, so calling get() only
        # re-raises such an exception here instead of dropping it silently.
        for result in pending:
            result.get()
  • 相关阅读:
    UNIX:处理SIGCHLD信号
    多维数组,指针数组使用,指向指针的指针
    bit field
    链表操作,获得泛型效果
    简明 Vim 练级攻略
    指针3,指向链表对象指针的指针
    大端模式,指针数组
    C宏设置掩码
    springboot 启动报错: Multiple Dockets with the same group name are not supported. The following duplicat
    HTML5学习笔记三
  • 原文地址:https://www.cnblogs.com/zhaoyingjie/p/11239828.html
Copyright © 2011-2022 走看看