zoukankan      html  css  js  c++  java
  • python爬取站长之家植物图片

     1 from lxml import etree
     2 from urllib import request
     3 import urllib.parse
     4 import time
     5 import os
     6 
     7 
     8 def handle_request(url,page):
     9     if page == 1:
    10         url = url.format('')
    11     else:
    12         url = url.format('_'+str(page))
    13     headers = {
    14         "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    15     }
    16     request = urllib.request.Request(url=url,headers=headers)
    17 
    18     return request
    19 
    20 def download_img(image_src):
    21     dirpath = r'G:/untitled/zhiwu'
    22     if not os.path.exists(dirpath):
    23         os.mkdir(dirpath)
    24     # 文件名
    25     filename= os.path.basename(image_src)
    26     # 文件路径
    27     filepath = os.path.join(dirpath, filename)
    28     # 发送请求保存图片
    29     headers = {
    30         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    31     }
    32     request = urllib.request.Request(url=image_src, headers=headers)
    33     response = urllib.request.urlopen(request)
    34     print(response)
    35     with open(filepath,'wb') as fp:
    36         fp.write(response.read())
    37 
    38 def parse_content(content):
    39     # 解析内容,获取图片
    40     tree = etree.HTML(content)
    41     image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src')
    42     for image_src1 in image_list:
    43         image_src = image_src1.split('/')[-1]
    44         download_img(image_src)
    45 
    46 def main():
    47     url = 'http://sc.chinaz.com/tupian/huadetupian{}.html'
    48     start_page = int(input('请输入起始页码:'))
    49     end_page = int(input('请输入结束页码:'))
    50     for page in range(start_page, end_page + 1):
    51         request = handle_request(url, page)
    52         content = urllib.request.urlopen(request).read().decode()
    53         parse_content(content)
    54         time.sleep(1)
    55 
    56 
    57 if __name__ == '__main__':
    58     main()
  • 相关阅读:
    uva 10369 Arctic Network
    uvalive 5834 Genghis Khan The Conqueror
    uvalive 4848 Tour Belt
    uvalive 4960 Sensor Network
    codeforces 798c Mike And Gcd Problem
    codeforces 796c Bank Hacking
    codeforces 768c Jon Snow And His Favourite Number
    hdu 1114 Piggy-Bank
    poj 1276 Cash Machine
    bzoj 2423 最长公共子序列
  • 原文地址:https://www.cnblogs.com/erlchixiha/p/11805319.html
Copyright © 2011-2022 走看看