zoukankan      html  css  js  c++  java
  • Scraping Toutiao (今日头条) with requests + MongoDB, using multiprocessing

      1 import json
      2 import os
      3 from urllib.parse import urlencode
      4 import pymongo
      5 import requests
      6 from bs4 import BeautifulSoup
      7 from requests.exceptions import ConnectionError
      8 import re
      9 from multiprocessing import Pool
     10 from hashlib import md5
     11 from json.decoder import JSONDecodeError
     12 from config import *
     13 
     14 client = pymongo.MongoClient(MONGO_URL, connect=False)
     15 db = client[MONGO_DB]
     16 
     17 
     18 def get_page_index(offset, keyword):
     19     data = {
     20         'autoload': 'true',
     21         'count': 20,
     22         'cur_tab': 3,
     23         'format': 'json',
     24         'keyword': keyword,
     25         'offset': offset,
     26     }
     27     params = urlencode(data)
     28     base = 'http://www.toutiao.com/search_content/'
     29     url = base + '?' + params
     30     try:
     31         response = requests.get(url)
     32         if response.status_code == 200:
     33             return response.text
     34         return None
     35     except ConnectionError:
     36         print('Error occurred')
     37         return None
     38 
     39 
     40 def download_image(url):
     41     print('Downloading', url)
     42     try:
     43         response = requests.get(url)
     44         if response.status_code == 200:
     45             save_image(response.content)
     46         return None
     47     except ConnectionError:
     48         return None
     49 
     50 
     51 def save_image(content):
     52     file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
     53     print(file_path)
     54     if not os.path.exists(file_path):
     55         with open(file_path, 'wb') as f:
     56             f.write(content)
     57             f.close()
     58 
     59 
     60 def parse_page_index(text):
     61     try:
     62         data = json.loads(text)
     63         if data and 'data' in data.keys():
     64             for item in data.get('data'):
     65                 yield item.get('article_url')
     66     except JSONDecodeError:
     67         pass
     68 
     69 
     70 def get_page_detail(url):
     71     try:
     72         response = requests.get(url)
     73         if response.status_code == 200:
     74             return response.text
     75         return None
     76     except ConnectionError:
     77         print('Error occurred')
     78         return None
     79 
     80 
     81 def parse_page_detail(html, url):
     82     soup = BeautifulSoup(html, 'lxml')
     83     result = soup.select('title')
     84     title = result[0].get_text() if result else ''
     85     images_pattern = re.compile('gallery: JSON.parse("(.*)")', re.S)
     86     result = re.search(images_pattern, html)
     87     if result:
     88         data = json.loads(result.group(1).replace('\', ''))
     89         if data and 'sub_images' in data.keys():
     90             sub_images = data.get('sub_images')
     91             images = [item.get('url') for item in sub_images]
     92             for image in images: download_image(image)
     93             return {
     94                 'title': title,
     95                 'url': url,
     96                 'images': images
     97             }
     98 
     99 
    100 def save_to_mongo(result):
    101     if db[MONGO_TABLE].insert(result):
    102         print('Successfully Saved to Mongo', result)
    103         return True
    104     return False
    105 
    106 
    107 def main(offset):
    108     text = get_page_index(offset, KEYWORD)
    109     urls = parse_page_index(text)
    110     for url in urls:
    111         html = get_page_detail(url)
    112         result = parse_page_detail(html, url)
    113         if result: save_to_mongo(result)
    114 
    115 
    116 if __name__ == '__main__':
    117     pool = Pool()
    118     groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    119     pool.map(main, groups)
    120     pool.close()
    121     pool.join()
  • 相关阅读:
    [贪心经典算法]Kruskal算法
    [经典贪心算法]Prim算法
    Java容器之List接口
    Java容器之Set接口
    JDK中的泛型
    Java中ArrayList与数组间相互转换
    Java中的增强for循环
    Java容器之Iterator接口
    Java之容器
    eg_4
  • 原文地址:https://www.cnblogs.com/zhongshuiping/p/9714351.html
Copyright © 2011-2022 走看看