zoukankan      html  css  js  c++  java
  • 多进程爬取猫眼电影TOP100并保存到文本文件中

     1 import requests
     2 import re
     3 import json
     4 from requests.exceptions import RequestException
     5 from multiprocessing import Pool
     6 
     7 # 获取网页
     8 def get_one_page(url):
     9     headers = {
    10         'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'+
    11                      'Chrome/63.0.3239.132 Safari/537.36'}
    12     try:
    13         resp = requests.get(url,headers=headers)
    14         if resp.status_code == 200:
    15             return resp.text
    16         return None
    17     except RequestException:
    18         return None
    19 # 解析网页
    20 def parse_one_page(html):
    21     pattern = re.compile('<dd>.*?board-index.*?>(d+)</i>.*?name"><a'
    22                          +'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    23                          +'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',re.S)
    24     items = re.findall(pattern,html)
    25     for item in items:
    26         yield{
    27             'index':item[0],
    28             'title':item[1],
    29             'actor':item[2].strip(),
    30             'time':item[3],
    31             'score':item[4]+item[5],
    32             # 'image': item[6],
    33         }
    34 # 保存数据
    35 def write_to_file(content):
    36     with open('TOP1OO.txt','a',encoding='utf-8') as f:
    37         f.write(json.dumps(content,ensure_ascii=False)+'
    ')
    38 
    39 def main(offset):
    40     url = 'http://maoyan.com/board/4?offset='+str(offset)
    41     html = get_one_page(url)
    42     for item in parse_one_page(html):
    43         write_to_file(item)
    44 
    45 if __name__ == '__main__':
    46     # for i in range(10):
    47     #     main(i*10)
    48     pool = Pool()
    49     pool.map(main,[i*10 for i in range(10)])
  • 相关阅读:
    noip2018练习总结
    东方CannonBall (概率DP)
    数论
    逆序对
    USACO5.3 校园网Network of Schools(Tarjan缩点)
    USACO09FEB 改造路Revamping Trails(分层图模板)
    Comet OJ模拟赛 Day1
    Tarjan模板
    NOIP 天天爱跑步(树上差分)
    树上差分
  • 原文地址:https://www.cnblogs.com/ray-mmss/p/9375921.html
Copyright © 2011-2022 走看看