zoukankan      html  css  js  c++  java
  • 爬取图片(三)

    源码:

     1 import requests
     2 import json
     3 import re
     4 import os
     5 from urllib import request
     6 
     7 # 获取图集链接
     def get_urls(offset, headers):
         """Return the article URLs listed on one page of Toutiao gallery search results.

         Args:
             offset: Result offset placed in the ``offset`` query parameter
                 (each request asks for ``count=20`` entries).
             headers: HTTP headers passed to ``requests.get``.

         Returns:
             A list with the ``article_url`` of every result entry that has one.
             The list is empty when the response carries no result data.

         Raises:
             requests.RequestException: on network failure, timeout, or an
                 HTTP error status.
             ValueError: if the response body is not valid JSON.
         """
         url = (
             'https://www.toutiao.com/search_content/?offset={}&format=json'
             '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3'
             '&from=gallery'
         ).format(offset)
         # A timeout keeps the crawler from hanging forever on a stalled connection.
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
         # Guard against a missing or null 'data' field instead of raising
         # KeyError/TypeError while iterating.
         results = response.json().get('data') or []
         return [item['article_url'] for item in results if 'article_url' in item]
    18 
    19 # 下载图片
    # Captures the quoted JS string literal handed to JSON.parse(...). The
    # parentheses of the call are escaped so that group 1 is the literal itself
    # (unescaped parentheses would be regex groups and capture an empty string).
    _GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\(("(?:[^"\\]|\\.)*")\)')
    _TITLE_PATTERN = re.compile(r'<title>(.*?)</title>')
    # Characters that are path separators or are rejected in file names on Windows.
    _UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


    def _safe_dir_name(title):
        """Return *title* with path-unsafe characters replaced by '_' (never empty)."""
        cleaned = _UNSAFE_PATH_CHARS.sub('_', title).strip(' .')
        return cleaned or 'untitled'


    def _extract_image_urls(html):
        """Return the image URLs of the gallery embedded in *html*, or [] if there is none."""
        match = _GALLERY_PATTERN.search(html)
        if match is None:
            return []
        # The captured text is a quoted string whose content is itself JSON,
        # so it is decoded twice: literal -> str -> dict.
        gallery = json.loads(json.loads(match.group(1)))
        images = gallery.get('sub_images') or []
        return [image['url'] for image in images if 'url' in image]


    def download_pictures(url, headers):
        """Download every image of one gallery article into ``街拍图/<page title>/``.

        The function is best-effort: any failure is reported with ``print`` and
        the affected page or image is skipped, so a single bad article never
        stops the crawl. Images whose target file already exists are not
        downloaded again.

        Args:
            url: Address of the gallery article page.
            headers: HTTP headers passed to ``requests.get`` for the page request.

        Returns:
            None.
        """
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Skipping {}: request failed ({})'.format(url, exc))
            return
        print(url)

        html = response.text
        title_match = _TITLE_PATTERN.search(html)
        if title_match is None:
            print('Skipping {}: no <title> found'.format(url))
            return
        title = title_match.group(1)
        print(title)

        try:
            image_urls = _extract_image_urls(html)
        except (ValueError, TypeError, AttributeError) as exc:
            # Malformed or unexpectedly shaped gallery JSON.
            print('Skipping {}: cannot parse gallery data ({})'.format(url, exc))
            return
        if not image_urls:
            print('Skipping {}: no gallery images found'.format(url))
            return

        # The raw title may contain '/' or other characters that are not valid
        # in a directory name, so sanitize it before building the path.
        dir_name = '街拍图/' + _safe_dir_name(title)
        try:
            os.makedirs(dir_name, exist_ok=True)
        except OSError as exc:
            print('Skipping {}: cannot create {} ({})'.format(url, dir_name, exc))
            return

        for image_url in image_urls:
            filename = dir_name + '/' + image_url.split('/')[-1] + '.jpg'
            if os.path.exists(filename):
                continue
            print('正在下载:' + filename)
            try:
                request.urlretrieve(image_url, filename)
            except (OSError, ValueError) as exc:
                # Keep going with the remaining images of this gallery.
                print('Failed to download {}: {}'.format(image_url, exc))
                # Remove a partial file so a later run retries this image.
                if os.path.exists(filename):
                    try:
                        os.remove(filename)
                    except OSError:
                        pass
    47 
    48 
    if __name__ == '__main__':
        # Browser-like User-Agent handed to both helper functions.
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        )
        headers = {'user-agent': user_agent}

        # Crawl three result pages: offsets 0, 20 and 40.
        page_size = 20
        page_count = 3
        for page in range(page_count):
            offset = page * page_size
            for article_url in get_urls(offset, headers):
                download_pictures(article_url, headers)
  • 相关阅读:
    MySQL——sql语句处理时间——时间、字符串、时间戳互相转换
    MySQL——sql语句处理时间——日期加减天数
    Spring Boot——jpaProperties.getHibernateProperties()的使用
    Spring Boot——SpringBoot2+JPA+druid配置多数据源
    Spring Boot——log4j日志配置案例
    git命令——git 分支操作
    windows如何删除默认打开方式
    excel导出出现弹框
    笔记
    javascript中三个等号"==="是什么意思
  • 原文地址:https://www.cnblogs.com/zhxd-python/p/9501326.html
Copyright © 2011-2022 走看看