zoukankan      html  css  js  c++  java
  • 爬虫实例——爬取淘女郎相册(通过selenium、PhantomJS、BeautifulSoup爬取)

    环境

    操作系统:CentOS 6.7 32-bit

    Python版本:2.6.6

    第三方插件

    selenium

    PhantomJS

    BeautifulSoup

    代码

    # -*- coding: utf-8 -*-
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    '''
    作者:昨夜星辰
    '''
    import re
    import os
    import time
    import shutil
    import requests
    import subprocess
    from bs4 import BeautifulSoup
    from selenium import webdriver
    
    # 拼接url
    def joint_url(string):
        """Return an absolute https URL for a link taken from the page markup.

        string -- the URL as found in an href/src attribute, normally in
                  protocol-relative form ('//host/path').

        Returns the URL prefixed with 'https:'. A URL that already starts
        with 'http://' or 'https://' is returned unchanged, so the result
        never ends up as 'https:https://...'.
        """
        if string.startswith(('http://', 'https://')):
            return string
        return 'https:' + string
    
    # Remove whatever already exists at the given path, then create a fresh empty folder there.
    def create_folder(path):
        """Create an empty directory at path, replacing anything already there.

        path -- location of the directory to (re)create.

        An existing directory is removed together with its contents; an
        existing file or symbolic link is unlinked. Missing parent
        directories are created as well.

        Raises OSError if the old entry cannot be removed or the new
        directory cannot be created.
        """
        # lexists() also reports dangling symlinks, which exists() would miss
        # and which would then make the directory creation fail.
        if os.path.lexists(path):
            # rmtree() refuses to operate on a symlink, so a link that points
            # at a directory is unlinked instead of being followed.
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        os.makedirs(path)
    
    root_folder = '淘女郎'
    create_folder(root_folder)
    url = 'https://mm.taobao.com/json/request_top_list.htm?page=1'
    browser = webdriver.PhantomJS()
    browser.get(url)
    bs = BeautifulSoup(browser.page_source, 'lxml')
    for top in bs('p', 'top'):
        mm_url = joint_url(top.find('a')['href'])
        mm_name = top.find('a').text
        mm_age = top.find('em').text
        mm_city = top.find('span').text
        mm_folder = '%s/%s' % (root_folder, mm_name)
        create_folder(mm_folder)
        print '发现一位美眉,她叫做%s,今年%s,住在%s,现在开始爬取她的个人页面……' % (mm_name, mm_age, mm_city)
        browser.get(mm_url)
        bs1 = BeautifulSoup(browser.page_source, 'lxml')
        base_info = bs1.find('ul', 'mm-p-info-cell clearfix')
        info_list = base_info('span')
        result = []
        result.append('昵称:' + info_list[0].text)
        result.append('生日:' + info_list[1].text.strip())
        result.append('所在城市:' + info_list[2].text)
        result.append('职业:' + info_list[3].text)
        result.append('血型:' + info_list[4].text)
        result.append('学校/专业:' + info_list[5].text)
        result.append('风格:' + info_list[6].text)
        result.append('身高:' + base_info.find('li', 'mm-p-small-cell mm-p-height').find('p').text)
        result.append('体重:' + base_info.find('li', 'mm-p-small-cell mm-p-weight').find('p').text)
        result.append('三围:' + base_info.find('li', 'mm-p-small-cell mm-p-size').find('p').text)
        result.append('罩杯:' + base_info.find('li', 'mm-p-small-cell mm-p-bar').find('p').text)
        result.append('鞋码:' + base_info.find('li', 'mm-p-small-cell mm-p-shose').find('p').text)
        print '资料收集完毕,正在保存她的个人资料……'
        filename = '%s/%s.txt' % (mm_folder, mm_name)
        with open(filename, 'w') as f:
            f.write('
    '.join(result))
        print '保存完毕!现在开始爬取她的个人相册……'
        album_menu_url = joint_url(bs1.find('ul', 'mm-p-menu').find('a')['href'])
        browser.get(album_menu_url)
        time.sleep(3)
        bs2 = BeautifulSoup(browser.page_source, 'lxml')
        album_number = 1
        for album_info in bs2('div', 'mm-photo-cell-middle'):
            album_url = joint_url(album_info.find('h4').find('a')['href'])
            album_name = album_info.find('h4').find('a').text.strip()
            album_size = album_info.find('span', 'mm-pic-number').text
            print '现在开始爬取她的第%d个相册,相册名为:《%s》%s……' % (album_number, album_name, album_size)
            browser.get(album_url)
            js1 = 'return document.body.scrollHeight'
            js2 = 'window.scrollTo(0, document.body.scrollHeight)'
            old_scroll_height = 0
            while(browser.execute_script(js1) > old_scroll_height):
                old_scroll_height = browser.execute_script(js1)
                browser.execute_script(js2)
                time.sleep(3)
            bs3 = BeautifulSoup(browser.page_source, 'lxml')
            photo_number = 1
            for photo_area in bs3('div', 'mm-photoimg-area'):
                print '现在开始下载她这个相册的第%d张图片……' % photo_number,
                photo_url = joint_url(photo_area.find('a')['href'])
                browser.get(photo_url)
                bs4 = BeautifulSoup(browser.page_source, 'lxml')
                big_img_url = joint_url(bs4.find('img', id='J_MmBigImg')['src'])
                content = requests.get(big_img_url).content
                filename = '%s/%d.jpg' % (mm_folder, photo_number)
                with open(filename, 'wb') as f:
                    f.write(content)
                print '下载完毕!'
                photo_number += 1
            album_number += 1
  • 相关阅读:
    学习进度条--第十四周
    第二次团队冲刺--9
    第二次团队冲刺--8
    课堂练习--最少花费的购买书籍
    第二次团队冲刺--7
    第二次团队冲刺--6
    第二次团队冲刺--5
    软工第四周进度表
    软工第四周过程总结
    个人作业之二柱子四则运算2升级版
  • 原文地址:https://www.cnblogs.com/yestreenstars/p/5564025.html
Copyright © 2011-2022 走看看