zoukankan      html  css  js  c++  java
  • 今日头条网页图片爬取

     1 import requests,os,json,re
     2 from urllib import request
     3 from day3.mysql_text import mysql_conn
     4 for i in range(0,60,20):
     5     url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(i)
     6     print(url)
     7 
     8     headers = {
     9         'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    10     }
    11 
    12     response = requests.get(url,headers=headers)
    13     html_json_dict = response.json()
    14 
    15 
    16     # 获取dict中的data key对应的列表
    17 
    18 
    19 
    20     data_list = html_json_dict['data']
    21 
    22     # 获取列表中含有article_url的值
    23     for data_item in data_list:
    24         if 'article_url' in data_item:
    25             article_url = data_item['article_url']
    26 
    27             response = requests.get(article_url,headers=headers)
    28 
    29             html_str = response.text
    30             pattern = r'gallery: JSON.parse((.*)),'
    31 
    32             match_res = re.search(pattern, html_str)
    33 
    34             # 新建文件夹
    35             if not os.path.exists('downloads'):
    36                 os.mkdir('downloads')
    37 
    38             if match_res:
    39                 # print(match_res.group(1))
    40                 json_origin = match_res.group(1)
    41                 a1 = json.loads(json_origin)
    42                 # print(a1,type(a1))
    43                 a2 = json.loads(a1)
    44                 # print(a2['sub_images'])
    45                 for a2_list in a2['sub_images']:
    46                     image_url = a2_list['url']
    47 
    48                     filename = 'downloads/' + image_url.split('/')[-1] + '.jpg'
    49                     print(filename)
    50                     request.urlretrieve(image_url, filename)
    51 
    52             else:
    53                 pass
  • 相关阅读:
    计算机术语
    【转】 物理内存和线性空间
    windows Visual Studio 上安装 CUDA【转载】
    windows Notepad++ 上配置 vs 编译器 , 编译并运行
    单例模式 [转载]
    Java Swing布局管理器GridBagLayout的使用示例 [转]
    五年java工作应具备的技能
    三年java软件工程师应有的技能
    京东面试题 Java相关
    京东笔试题总结
  • 原文地址:https://www.cnblogs.com/daihao9527/p/9490909.html
Copyright © 2011-2022 走看看