zoukankan html css js c++ java

爬虫实例——爬取中关村美女频道照片

# -*- coding: utf-8 -*-
import os
import re
import shutil
import requests
from bs4 import BeautifulSoup

import sys
# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# NOTE(review): presumably needed so that the UTF-8 byte-string messages in this
# script can be combined with the unicode text scraped from pages without an
# ASCII decode error -- confirm before porting to Python 3, where neither
# reload() as a builtin nor sys.setdefaultencoding() exists.
reload(sys)
sys.setdefaultencoding('utf8')

def get_soup(url):
    """Fetch ``url`` and return its markup parsed into a BeautifulSoup tree.

    The page is downloaded with ``requests`` and parsed with the ``lxml``
    parser.

    :param url: address of the page to fetch.
    :returns: a ``BeautifulSoup`` object built from the response text.
    :raises requests.exceptions.RequestException: if the request times out,
        cannot be completed, or the server answers with a 4xx/5xx status.
    """
    # requests never times out by default, so a stalled server would hang
    # the whole script; bound the wait explicitly.
    response = requests.get(url, timeout=30)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # tripping over a missing element later on.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')
    
def mkdir(path):
    """Create a fresh, empty directory at ``path``.

    If something already exists at ``path`` the user is asked to confirm.
    Answering ``Y`` deletes the existing directory tree, file or symbolic
    link first; answering ``N`` terminates the program; any other answer
    repeats the question.

    :param path: location of the directory to create.
    :raises SystemExit: if the user declines to delete the existing entry.
    :raises OSError: if the existing entry cannot be removed or the
        directory cannot be created.
    """
    # lexists() also reports broken symlinks, which exists() misses and which
    # would otherwise make os.mkdir() fail because the name is taken.
    if os.path.lexists(path):
        while True:
            key = raw_input('%s已存在，继续操作将删除它，是否继续？（Y/N）' % path)
            if key == 'Y':
                break
            if key == 'N':
                # sys.exit() rather than the exit() helper: the latter is
                # injected by the site module for interactive sessions and
                # is not meant for use in programs.
                sys.exit()
        # rmtree() refuses to operate on a symlink, so only real directories
        # go through it; links (even to directories) are simply unlinked.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
    os.mkdir(path)
    
def _extension_of(url):
    """Return the file extension of ``url`` including the dot ('' if none)."""
    # Drop any query string / fragment so it is not mistaken for the suffix.
    path = url.split('?', 1)[0].split('#', 1)[0]
    return os.path.splitext(path)[1]


def _confirm_next_page():
    """Ask until the user answers Y or N; return True for Y, False for N."""
    while True:
        key = raw_input('当前页面已经下载完了，是否继续下载下一页？（Y/N）')
        if key == 'Y':
            return True
        if key == 'N':
            return False


def _download_album(album_url, prefix):
    """Save every picture of the album at ``album_url`` into its own directory."""
    album = get_soup(album_url)
    albums_name = album.find('a', id='titleName').get_text()
    print('正在下载《%s》相册……' % albums_name)
    # storage_path_of_albums is a module-level name set by the script entry
    # point before this function is called.
    storage_path_of_picture = storage_path_of_albums + '/' + albums_name
    mkdir(storage_path_of_picture)
    picture_links = album.find('ul', id='showImg').find_all('a')
    for count, picture_link in enumerate(picture_links, 1):
        print('正在下载第%d张照片……' % count)
        picture_page = get_soup(prefix + picture_link.get('href'))
        image_url = picture_page.find('img', id='bigImg').get('src')
        # Bound the wait so one stalled image cannot hang the whole run.
        content = requests.get(image_url, timeout=30).content
        # The previous pattern r'.*(..*)' had an unescaped dot and therefore
        # captured only the last character of the URL, not the extension.
        filename = (storage_path_of_picture + '/' + str(count)
                    + _extension_of(image_url))
        with open(filename, 'wb') as f:
            f.write(content)


def download_all_albums_of_current_page(url):
    """Download all albums linked from the listing page ``url``, page by page.

    Every album link found in the listing is followed; a directory named
    after the album is created (via ``mkdir``) below the module-level
    ``storage_path_of_albums`` and each picture is stored in it as
    ``<n><extension>``, numbered from 1.  When a page is finished the user
    is asked whether to go on with the next listing page; answering ``N``,
    or reaching a page without a usable "next" link, ends the download.

    :param url: address of the first listing page to process.
    """
    prefix = 'http://desk.zol.com.cn'
    # Iterate over pages instead of recursing: the former recursive call sat
    # inside the prompt loop without leaving it, so after the nested call
    # returned the user was asked again and 'Y' re-downloaded the same page.
    while True:
        listing = get_soup(url)

        # A page without a "next" link used to raise AttributeError before
        # anything was downloaded.
        next_link = listing.find('a', id='pageNext')
        next_href = next_link.get('href') if next_link is not None else None
        # hrefs are appended to the site prefix, so only site-relative paths
        # produce a usable address.
        if next_href and next_href.startswith('/'):
            url_of_next_page = prefix + next_href
        else:
            url_of_next_page = None

        album_links = listing.find('ul', 'pic-list2  clearfix').find_all('a')
        for album_link in album_links:
            _download_album(prefix + album_link.get('href'), prefix)

        if url_of_next_page is None or not _confirm_next_page():
            break
        url = url_of_next_page
    
if __name__ == '__main__':
    # Root directory for all downloaded albums. It has to stay a module-level
    # name: download_all_albums_of_current_page() reads it as a global.
    storage_path_of_albums = './picture'
    # Start from an empty directory (asks before deleting an existing one).
    mkdir(storage_path_of_albums)

    # First listing page of the channel to scrape.
    url = 'http://desk.zol.com.cn/meinv/'
    download_all_albums_of_current_page(url)

查看全文

相关阅读:
排序——选择排序和插入排序
 排序——排序的基本概念
 字符串类——KMP算法的应用
 字符串类——KMP子串查找算法
 字符串类——字符串类的创建（下）
字符串类——字符串类的创建（上）
数据结构库——链式队列的实现
 P4180 【模板】严格次小生成树[BJWC2010]
P2511 [HAOI2008]木棍分割
 P2613 【模板】有理数取余

原文地址：https://www.cnblogs.com/yestreenstars/p/5489691.html