zoukankan      html  css  js  c++  java
  • 爬虫实例——爬取中关村美女频道照片

    # -*- coding: utf-8 -*-
    import os
    import re
    import shutil
    import requests
    from bs4 import BeautifulSoup
    
    # Python 2-only workaround: switch the interpreter's default codec to
    # UTF-8 so that implicit str/unicode conversions in this script (e.g.
    # formatting scraped unicode titles into UTF-8 byte-string literals) do not
    # raise UnicodeDecodeError.  The order matters: reload(sys) has to run
    # before setdefaultencoding can be called.
    # NOTE(review): neither the builtin `reload` nor sys.setdefaultencoding
    # exists on Python 3, so these lines must be removed when porting.
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    def get_soup(url, timeout=30):
        """Fetch *url* and return the page parsed as a BeautifulSoup tree.

        Args:
            url: Absolute URL of the HTML page to download.
            timeout: Seconds to wait for the server before giving up.  Without
                a timeout a stalled connection would block the scraper forever.

        Returns:
            A BeautifulSoup object built with the 'lxml' parser.

        Raises:
            requests.exceptions.RequestException: on connection problems,
                timeouts, or a 4xx/5xx HTTP status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error page and
        # crashing later with an unrelated AttributeError on a missing tag.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')
        
    def mkdir(path):
        """Create a fresh, empty directory at *path*.

        If something already exists at *path* the user is asked to confirm.
        Answering 'Y' deletes the existing directory tree (or single file)
        before the directory is recreated; answering 'N' terminates the
        program.  Any other answer repeats the question.

        Args:
            path: Location of the directory to (re)create.

        Raises:
            SystemExit: when the user declines to delete the existing path.
            OSError: if removing the old path or creating the directory fails.
        """
        if os.path.exists(path):
            while True:
                key = raw_input('%s已存在,继续操作将删除它,是否继续?(Y/N)' % path)
                # Accept lower-case answers and stray whitespace as well.
                key = key.strip().upper()
                if key == 'Y':
                    break
                elif key == 'N':
                    # sys.exit is always available, whereas the bare `exit`
                    # helper is only injected by the site module.
                    sys.exit()
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        os.mkdir(path)
        
    def _download_album(album_url, prefix):
        """Download every picture of the album at *album_url*.

        The pictures are stored as 1.<ext>, 2.<ext>, ... inside a directory
        named after the album, below the module-level
        ``storage_path_of_albums`` directory.
        """
        album_soup = get_soup(album_url)
        albums_name = album_soup.find('a', id='titleName').get_text()
        print('正在下载《%s》相册……' % albums_name)
        storage_path_of_picture = storage_path_of_albums + '/' + albums_name
        mkdir(storage_path_of_picture)
        thumbnails = album_soup.find('ul', id='showImg').find_all('a')
        for count, thumbnail in enumerate(thumbnails, 1):
            # Each thumbnail links to a viewer page that holds the full image.
            picture_soup = get_soup(prefix + thumbnail.get('href'))
            picture_url = picture_soup.find('img', id='bigImg').get('src')
            content = requests.get(picture_url, timeout=30).content
            # Keep the real file extension (e.g. '.jpg').  The previous regex
            # r'.*(..*)' had an unescaped dot and therefore captured only the
            # last character of the URL.  A query string, if any, is dropped.
            suffix = os.path.splitext(picture_url.split('?', 1)[0])[1]
            filename = storage_path_of_picture + '/' + str(count) + suffix
            print('正在下载第%d张照片……' % count)
            with open(filename, 'wb') as f:
                f.write(content)

    def download_all_albums_of_current_page(url):
        """Download all albums listed on the page *url*, then offer the next page.

        After the current listing page is finished the user is asked whether
        the next page should be downloaded too ('Y' continues, 'N' stops, any
        other answer repeats the question).

        Args:
            url: URL of an album listing page on desk.zol.com.cn.
        """
        prefix = 'http://desk.zol.com.cn'
        page_soup = get_soup(url)

        # The "next page" link may be absent; NOTE(review): presumably that is
        # the case on the last listing page -- confirm against the site.
        next_link = page_soup.find('a', id='pageNext')
        next_href = next_link.get('href') if next_link is not None else None
        url_of_next_page = prefix + next_href if next_href else None

        for album_link in page_soup.find('ul', 'pic-list2  clearfix').find_all('a'):
            _download_album(prefix + album_link.get('href'), prefix)

        if url_of_next_page is None:
            print('已经是最后一页了,没有下一页可以下载。')
            return
        while True:
            key = raw_input('当前页面已经下载完了,是否继续下载下一页?(Y/N)')
            key = key.strip().upper()
            if key == 'Y':
                download_all_albums_of_current_page(url_of_next_page)
                # Return once the following pages are done; looping again
                # would re-prompt for, and re-download, the same next page.
                return
            elif key == 'N':
                return
        
    if __name__ == '__main__':
        # Root directory for all downloaded albums.  It is deliberately a
        # module-level name: the download routine reads it as a global.
        storage_path_of_albums = './picture'
        mkdir(storage_path_of_albums)

        # Start from the first listing page of the channel; further pages are
        # offered interactively by the download routine itself.
        download_all_albums_of_current_page('http://desk.zol.com.cn/meinv/')
  • 相关阅读:
    网络-路由交换-IPv4-Cisco-协议概要
    网络-路由交换-IPv4-Cisco-协议基础
    网络-路由交换-网络安全-华为-ACL分类
    泰克-OSPF
    网络-路由交换-网络安全-华为-DHCP基础
    不同系统下的数据参考
    model一定是和数据库表对应的么?
    NUnit属性-百度Nunit-Gui
    NUnit属性
    NUnit详细使用方法
  • 原文地址:https://www.cnblogs.com/yestreenstars/p/5489691.html
Copyright © 2011-2022 走看看