zoukankan      html  css  js  c++  java
  • 爬虫实例——爬取中关村美女频道照片

    # -*- coding: utf-8 -*-
    import os
    import re
    import shutil
    import requests
    from bs4 import BeautifulSoup
    
    import sys
    # Python 2 only: site.py deletes sys.setdefaultencoding at interpreter
    # start-up, so the module is reloaded to get the function back, and the
    # implicit str <-> unicode conversion codec is then switched to UTF-8.
    # NOTE(review): presumably needed so the UTF-8 byte-string messages below
    # can be %-formatted with the unicode text returned by BeautifulSoup --
    # confirm before removing. Neither call exists on Python 3.
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    def get_soup(url, timeout=30):
        """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

        Args:
            url: Absolute URL of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block forever on a stalled
                connection.

        Returns:
            A ``BeautifulSoup`` object built with the ``lxml`` parser.

        Raises:
            requests.RequestException: on connection failure, timeout, or an
                HTTP error status (4xx/5xx).
        """
        response = requests.get(url, timeout=timeout)
        # Fail here with a clear HTTP error instead of parsing an error page
        # and crashing later on a missing element.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')
        
    def mkdir(path):
        """Create a fresh, empty directory at ``path``.

        If ``path`` already exists the user is asked on the console whether it
        may be deleted: 'Y' removes the existing directory tree (or file) and
        continues, 'N' terminates the program, any other answer repeats the
        question. Answers are matched case-insensitively and surrounding
        whitespace is ignored.

        Args:
            path: Directory to create. Missing parent directories are created
                as well.

        Raises:
            SystemExit: if the user refuses to delete an existing ``path``.
            OSError: if removing the old entry or creating the directory fails.
        """
        if os.path.exists(path):
            while True:
                key = raw_input('%s已存在,继续操作将删除它,是否继续?(Y/N)' % path)
                answer = key.strip().upper()
                if answer == 'Y':
                    break
                elif answer == 'N':
                    # sys.exit() is always available; the bare exit() helper is
                    # injected by the site module and is meant for interactive
                    # sessions only.
                    sys.exit()
            # rmtree refuses to operate on a symlink, so a link to a directory
            # is unlinked like a plain file instead.
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        # makedirs also creates missing parents, where os.mkdir would fail.
        os.makedirs(path)
        
    def _picture_suffix(picture_url):
        """Return the file extension of ``picture_url`` including the dot, or ''."""
        # Strip query string and fragment so they cannot leak into the file name.
        path = picture_url.split('?', 1)[0].split('#', 1)[0]
        return os.path.splitext(path)[1]

    def _download_album(prefix, album_url):
        """Save every picture of the album page ``album_url`` into its own folder."""
        album_soup = get_soup(album_url)
        albums_name = album_soup.find('a', id='titleName').get_text()
        print('正在下载《%s》相册……' % albums_name)
        # storage_path_of_albums is the module-level global assigned in the
        # __main__ block.
        storage_path_of_picture = storage_path_of_albums + '/' + albums_name
        mkdir(storage_path_of_picture)
        thumbnails = album_soup.find('ul', id='showImg').find_all('a')
        for count, thumbnail in enumerate(thumbnails, 1):
            print('正在下载第%d张照片……' % count)
            # Each thumbnail links to a detail page that holds the full image.
            picture_page = get_soup(prefix + thumbnail.get('href'))
            picture_url = picture_page.find('img', id='bigImg').get('src')
            content = requests.get(picture_url).content
            filename = (storage_path_of_picture + '/' + str(count)
                        + _picture_suffix(picture_url))
            with open(filename, 'wb') as f:
                f.write(content)

    def download_all_albums_of_current_page(url):
        """Download all albums listed on ``url``, then optionally the next pages.

        Every album linked from the listing page is stored in a sub-directory
        of the global ``storage_path_of_albums`` named after the album, with
        pictures numbered from 1 and keeping their original file extension.
        After each page the user is asked whether to continue with the next
        page ('Y') or stop ('N'); other answers repeat the question. When the
        page has no next-page link the function returns without asking.

        Args:
            url: URL of the album listing page to start from.
        """
        prefix = 'http://desk.zol.com.cn'
        while True:
            page_soup = get_soup(url)
            # The last page has no "next" link; remember that instead of
            # crashing before any album has been downloaded.
            next_link = page_soup.find('a', id='pageNext')
            next_href = next_link.get('href') if next_link is not None else None
            for album_link in page_soup.find('ul', 'pic-list2  clearfix').find_all('a'):
                _download_album(prefix, prefix + album_link.get('href'))
            if not next_href:
                return
            while True:
                key = raw_input('当前页面已经下载完了,是否继续下载下一页?(Y/N)')
                answer = key.strip().upper()
                if answer in ('Y', 'N'):
                    break
            if answer == 'N':
                return
            # Iterate instead of recursing: declining on a later page ends the
            # whole run rather than re-prompting for every earlier page.
            url = prefix + next_href
        
    if __name__ == '__main__':
        # Listing page the crawl starts from.
        url = 'http://desk.zol.com.cn/meinv/'

        # Root folder for all albums; read as a module-level global by
        # download_all_albums_of_current_page. mkdir asks before replacing
        # an existing folder.
        storage_path_of_albums = './picture'
        mkdir(storage_path_of_albums)

        download_all_albums_of_current_page(url)
  • 相关阅读:
    django 我的博客 (慕课网视频)笔记
    读 django 中文文档投票例子笔记
    django的安装和初步使用
    Debug模式自定义NSlog
    重写NSString的setter方法
    iOS 常用代码之 UICollectionView
    生成100个 "20180520" 这样的时间字符串 写入txt文件
    WRNavigationBar 使用记录
    关于iphone设置显示模式为标准模式和放大模式时的区别
    CGContextRef 使用小记
  • 原文地址:https://www.cnblogs.com/yestreenstars/p/5489691.html
Copyright © 2011-2022 走看看