zoukankan      html  css  js  c++  java
  • 分析Ajax爬取今日头条街拍美图

     1 import os
     2 import requests
     3 from urllib.parse import urlencode
     4 from hashlib import md5
     5 from multiprocessing.pool import Pool
     6 
# Inclusive range of page-group indices to crawl; the script entry point
# multiplies each index by 20 to obtain the offset passed to main().
GROUP_START = 1
GROUP_END = 5
     9 
    10 def get_page(offset):
    11     params = {
    12         'offset': offset,
    13         'format': 'json',
    14         'keyword': '街拍',
    15         'autoload': 'true',
    16         'count': '20',
    17         'cur_tab': '3',
    18         'from': 'gallery',
    19     }
    20     url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    21     try:
    22         response = requests.get(url)
    23         if response.status_code == 200:
    24             return response.json()
    25     except requests.ConnectionError:
    26         return None
    27 
    28 def get_images(json):
    29     data = json.get('data')
    30     if data:
    31         for item in data:
    32             # print(item)
    33             image_list = item.get('image_list')
    34             title = item.get('title')
    35             # print(image_list)
    36             for image in image_list:
    37                 yield {
    38                     'image': image.get('url'),
    39                     'title': title
    40                 }
    41 
    42 def save_image(item):
    43     if not os.path.exists(item.get('title')):
    44         os.mkdir(item.get('title'))
    45     try:
    46         local_image_url = item.get('image')
    47         new_image_url = local_image_url.replace('list','large')
    48         response = requests.get('http:' + new_image_url)
    49         if response.status_code == 200:
    50             file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
    51             if not os.path.exists(file_path):
    52                 with open(file_path, 'wb')as f:
    53                     f.write(response.content)
    54             else:
    55                 print('Already Downloaded', file_path)
    56     except requests.ConnectionError:
    57         print('Failed to save image')
    58 
    59 def main(offset):
    60     json = get_page(offset)
    61     for item in get_images(json):
    62         print(item)
    63         save_image(item)
    64 
    65 if __name__ == '__main__':
    66     pool = Pool()
    67     groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    68     pool.map(main, groups)
    69     pool.close()
    70     pool.join()
  • 相关阅读:
    各种数据库查询表及表信息的SQL
    多维表头的DataGridView
    SQLite入门笔记
    配置WCF的心得
    JS键盘的键码
    ASP.NET的URL过滤
    利用反射查看类成员
    一个简单的MVC示例
    一个日志类 LogUtil
    一个IniHelper
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9123226.html
Copyright © 2011-2022 走看看