zoukankan      html  css  js  c++  java
  • Node.js爬取百度图片瀑布流,使用class类封装。

    //爬取百度高清图片
    const phantom = require('phantom')
    const express = require('express');
    const app = express();
    const fs= require('fs');
    const cheerio = require('cheerio');
    const request = require('request')
    // Start the HTTP server on port 2000 and print the address it is bound to.
    const server = app.listen(2000, () => {
        const { address: host, port } = server.address();
        console.log('Your App is running at http://%s:%s', host, port);
    });
    
    /**
     * Crawler for a Baidu image-search result page (waterfall layout).
     *
     * Flow: load the result page in PhantomJS and scroll until enough
     * thumbnails are rendered, collect the detail-page link of every
     * thumbnail, then open each detail page, read the full-size image URL
     * and download it into ./image.
     */
    class stealData {

        constructor() {
            // Search-result page to crawl.
            this.base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&oq=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&rsp=-1';

            this.current_page = 1;
            // Collected entries of the form { down_loda_url }.
            this.result_list = [];
            // cheerio selection of the '.imgitem' elements, filled in by pageScroll().
            this.a = '';
        }

        /**
         * Run the whole crawl. Errors from any step are logged, not rethrown.
         * @returns {Promise<void>}
         */
        async init() {
            try {
                await this.openNet(); // load the result page
                await this.getLoadPictures(); // collect detail-page links
                await this.imgSave(0); // download the full-size images
            } catch (e) {
                console.log(e);
            }
        }

        /**
         * Wait `time` milliseconds, announcing the pause on the console.
         * @param {number} time - pause length in milliseconds
         * @returns {Promise<void>}
         */
        sleep(time) {
            return new Promise((resolve) => {
                console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`);
                setTimeout(resolve, time);
            });
        }

        /**
         * Wait `second` seconds.
         * @param {number} second - pause length in seconds
         * @returns {Promise<void>}
         */
        delay(second) {
            return new Promise((resolve) => {
                setTimeout(resolve, second * 1000);
            });
        }

        /**
         * Load the result page, starting at the top.
         * @returns {Promise<void>}
         */
        async openNet() {
            await this.pageScroll(0);
        }

        /**
         * Open the result page once and keep scrolling it until at least 20
         * '.imgbox' elements are present, then store the '.imgitem' selection
         * in `this.a`.
         *
         * A single PhantomJS instance and page are reused for every scroll
         * step and the instance is always shut down afterwards.
         *
         * @param {number} [i=0] - first scroll step; step k scrolls to 1000 * k px
         * @param {number} [maxScrolls=30] - upper bound on additional scroll
         *     steps, so a page that never reaches 20 images cannot loop forever
         * @returns {Promise<void>}
         * @throws {Error} when the page cannot be opened
         */
        async pageScroll(i = 0, maxScrolls = 30) {
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                // Browser window size used to render the page.
                await page.property('viewportSize', {
                    width: 1920,
                    height: 1080
                });
                const status = await page.open(this.base_url);
                if (status !== 'success') {
                    throw new Error(`打开页面失败(${status}): ${this.base_url}`);
                }

                let $;
                for (let step = 0; ; step++) {
                    await page.property('scrollPosition', {
                        left: 0,
                        top: 1000 * (i + step)
                    });
                    // Give the page time to load the newly revealed images.
                    await this.delay(5);
                    $ = cheerio.load(await page.property('content'));
                    const count = $('.imgbox').length;
                    console.log(count);
                    if (count >= 20 || step >= maxScrolls) {
                        break;
                    }
                }
                // Every DOM element that holds an image.
                this.a = $('.imgitem');
            } finally {
                await instance.exit();
            }
        }

        /**
         * Build the detail-page URL of every collected thumbnail element and
         * append it to `this.result_list`. Elements without a link are skipped.
         *
         * @param {object} [a=this.a] - cheerio selection of image elements
         * @returns {Promise<void>}
         */
        async getLoadPictures(a = this.a) {
            // Nothing has been collected yet (this.a still holds its initial '').
            if (!a || typeof a.each !== 'function') {
                return;
            }
            const result_list = [];
            a.each((index) => {
                const href = a.eq(index).find('a').attr('href');
                if (href) {
                    result_list.push({
                        down_loda_url: `https://image.baidu.com${href}`
                    });
                }
            });
            this.result_list.push(...result_list);
        }

        /**
         * Open the detail page of every entry from index `i` onwards, read the
         * full-size image URL from '#currentImg' and download it.
         *
         * One PhantomJS instance is shared by all pages and is always shut
         * down at the end. Entries without an image URL, and downloads that
         * fail, are logged and skipped so the remaining images are still saved.
         *
         * @param {number} [i=0] - index in `this.result_list` to start from
         * @returns {Promise<void>}
         */
        async imgSave(i = 0) {
            if (i >= this.result_list.length) {
                return;
            }
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                for (let index = i; index < this.result_list.length; index++) {
                    const status = await page.open(this.result_list[index].down_loda_url);
                    await this.delay(2);
                    const content = await page.property('content');
                    const $ = cheerio.load(content);
                    const src = $('#currentImg').attr('src'); // full-size image URL
                    console.log('statue', status);
                    console.log('src', src);
                    if (!src) {
                        console.log(`第${index + 1}张未找到高清图地址,已跳过`);
                        continue;
                    }
                    try {
                        await this.save(src, index);
                    } catch (e) {
                        console.log(e);
                    }
                }
            } finally {
                await instance.exit();
            }
        }

        /**
         * Derive a file extension from an image URL, ignoring the query string
         * and fragment.
         *
         * @param {string} url - image URL
         * @returns {string} the extension without the dot, or 'jpg' when the
         *     last path segment has none
         */
        fileExtension(url) {
            const text = typeof url === 'string' ? url : '';
            const path = text.split(/[?#]/)[0];
            const fileName = path.slice(path.lastIndexOf('/') + 1);
            const match = /\.([A-Za-z0-9]+)$/.exec(fileName);
            return match ? match[1] : 'jpg';
        }

        /**
         * Download one image into ./image, creating the directory if needed.
         *
         * @param {string} url - image URL
         * @param {number} i - zero-based index of the image, used in log output
         * @returns {Promise<string>} resolves with the written file path once
         *     the file has been completely written; rejects on a missing URL
         *     or on a request/file-system error
         */
        save(url, i) {
            return new Promise((resolve, reject) => {
                if (typeof url !== 'string' || url === '') {
                    reject(new TypeError(`第${i + 1}张图片缺少下载地址`));
                    return;
                }
                console.log(`开始写入第${i+1}张`);
                fs.mkdirSync('./image', { recursive: true });
                const target = `./image/${new Date().getTime()}.${this.fileExtension(url)}`;
                const output = fs.createWriteStream(target);
                // Report success only after the data has actually been flushed.
                output.on('finish', () => {
                    console.log(`写入成功`);
                    resolve(target);
                });
                output.on('error', reject);
                request(url).on('error', reject).pipe(output);
            });
        }

    }
    
    // Start the crawl. The constructor shown above declares no parameters, so
    // the 'xxx_url' argument is currently ignored.
    const thief = new stealData('xxx_url');
    // init() returns a promise; attach a rejection handler so that a failure
    // outside its internal try/catch is reported instead of becoming an
    // unhandled rejection.
    thief.init().catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });

    下面是优化后的版本:下载高清图时改用 for 循环,并复用同一个 phantom 实例和页面,不再为每张图片递归创建新的实例。

    //爬取百度高清图片
    const phantom = require('phantom')
    const express = require('express');
    const app = express();
    const fs= require('fs');
    const cheerio = require('cheerio');
    const request = require('request')
    // Start the HTTP server on port 2000 and print the address it is bound to.
    const server = app.listen(2000, () => {
        const { address: host, port } = server.address();
        console.log('Your App is running at http://%s:%s', host, port);
    });
    
    /**
     * Crawler for a Baidu image-search result page (waterfall layout).
     *
     * Flow: load the result page in PhantomJS and scroll until enough
     * thumbnails are rendered, collect the detail-page link of every
     * thumbnail, then open each detail page, read the full-size image URL
     * and download it into ./Marvel.
     */
    class stealData {

        constructor() {
            // Search-result page to crawl.
            this.base_url = 'https://image.baidu.com/search/index?ct=201326592&z=&tn=baiduimage&word=%E6%BC%AB%E5%A8%81%E5%9B%BE%E7%89%87&pn=0&ie=utf-8&oe=utf-8&cl=2&lm=-1&fr=ala&se=&sme=&width=1920&height=1080';
            this.current_page = 1;
            // Collected entries of the form { down_loda_url }.
            this.result_list = [];
            // cheerio selection of the '.imgitem' elements, filled in by pageScroll().
            this.a = '';
            this.urllist = [];
        }

        /**
         * Run the whole crawl. Errors from any step are logged, not rethrown.
         * @returns {Promise<void>}
         */
        async init() {
            try {
                await this.openNet(); // load the result page
                await this.getLoadPictures(); // collect detail-page links
                await this.getrealPictures(); // resolve and download full-size images
            } catch (e) {
                console.log(e);
            }
        }

        /**
         * Wait `time` milliseconds, announcing the pause on the console.
         * Used between downloads to avoid getting the IP banned.
         * @param {number} time - pause length in milliseconds
         * @returns {Promise<void>}
         */
        sleep(time) {
            return new Promise((resolve) => {
                console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`);
                setTimeout(resolve, time);
            });
        }

        /**
         * Wait `second` seconds.
         * @param {number} second - pause length in seconds
         * @returns {Promise<void>}
         */
        delay(second) {
            return new Promise((resolve) => {
                setTimeout(resolve, second * 1000);
            });
        }

        /**
         * Load the result page, starting at the top.
         * @returns {Promise<void>}
         */
        async openNet() {
            await this.pageScroll(0);
        }

        /**
         * Open the result page once and keep scrolling it until at least 20
         * '.imgbox' elements are present, then store the '.imgitem' selection
         * in `this.a`.
         *
         * A single PhantomJS instance and page are reused for every scroll
         * step and the instance is always shut down afterwards.
         *
         * @param {number} [i=0] - first scroll step; step k scrolls to 1000 * k px
         * @param {number} [maxScrolls=30] - upper bound on additional scroll
         *     steps, so a page that never reaches 20 images cannot loop forever
         * @returns {Promise<void>}
         * @throws {Error} when the page cannot be opened
         */
        async pageScroll(i = 0, maxScrolls = 30) {
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                // Browser window size used to render the page.
                await page.property('viewportSize', {
                    width: 1920,
                    height: 1080
                });
                const status = await page.open(this.base_url);
                if (status !== 'success') {
                    throw new Error(`打开页面失败(${status}): ${this.base_url}`);
                }

                let $;
                for (let step = 0; ; step++) {
                    await page.property('scrollPosition', {
                        left: 0,
                        top: 1000 * (i + step)
                    });
                    // Give the page time to load the newly revealed images.
                    await this.delay(5);
                    $ = cheerio.load(await page.property('content'));
                    const count = $('.imgbox').length;
                    console.log(count);
                    if (count >= 20 || step >= maxScrolls) {
                        break;
                    }
                }
                // Every DOM element that holds an image.
                this.a = $('.imgitem');
            } finally {
                await instance.exit();
            }
        }

        /**
         * Build the detail-page URL of every collected thumbnail element and
         * append it to `this.result_list`. Elements without a link are skipped.
         *
         * @param {object} [a=this.a] - cheerio selection of image elements
         * @returns {Promise<void>}
         */
        async getLoadPictures(a = this.a) {
            // Nothing has been collected yet (this.a still holds its initial '').
            if (!a || typeof a.each !== 'function') {
                return;
            }
            const result_list = [];
            a.each((index) => {
                const href = a.eq(index).find('a').attr('href');
                if (href) {
                    result_list.push({
                        down_loda_url: `https://image.baidu.com${href}`
                    });
                }
            });
            this.result_list.push(...result_list);
        }

        /**
         * Derive a file extension from an image URL, ignoring the query string
         * and fragment.
         *
         * @param {string} url - image URL
         * @returns {string} the extension without the dot, or 'jpg' when the
         *     last path segment has none
         */
        fileExtension(url) {
            const text = typeof url === 'string' ? url : '';
            const path = text.split(/[?#]/)[0];
            const fileName = path.slice(path.lastIndexOf('/') + 1);
            const match = /\.([A-Za-z0-9]+)$/.exec(fileName);
            return match ? match[1] : 'jpg';
        }

        /**
         * Download one image into ./Marvel, creating the directory if needed.
         *
         * @param {string} url - image URL
         * @param {number} i - zero-based index of the image, used in log output
         * @returns {Promise<string>} resolves with the written file path once
         *     the file has been completely written; rejects on a missing URL
         *     or on a request/file-system error
         */
        save(url, i) {
            return new Promise((resolve, reject) => {
                if (typeof url !== 'string' || url === '') {
                    reject(new TypeError(`第${i + 1}张图片缺少下载地址`));
                    return;
                }
                console.log(`开始写入第${i+1}张`);
                fs.mkdirSync('./Marvel', { recursive: true });
                const target = `./Marvel/${new Date().getTime()}.${this.fileExtension(url)}`;
                const output = fs.createWriteStream(target);
                // Report success only after the data has actually been flushed.
                output.on('finish', () => {
                    console.log(`写入成功`);
                    resolve(target);
                });
                output.on('error', reject);
                request(url).on('error', reject).pipe(output);
            });
        }

        /**
         * Open the detail page of every entry in `this.result_list`, read the
         * full-size image URL from '#currentImg' and download it.
         *
         * One PhantomJS instance and page are shared by all entries. The page
         * content is read only after the detail page has been opened, and the
         * instance is shut down even when the last entry fails. A failure on
         * one entry is logged and does not stop the remaining ones.
         *
         * @returns {Promise<void>}
         */
        async getrealPictures() {
            if (this.result_list.length === 0) {
                return;
            }
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                for (let i = 0; i < this.result_list.length; i++) {
                    try {
                        const status = await page.open(this.result_list[i].down_loda_url);
                        await this.delay(2);
                        const content = await page.property('content');
                        const $ = cheerio.load(content);
                        const src = $('#currentImg').attr('src'); // full-size image URL
                        console.log('src', src);
                        if (!src) {
                            console.log(`第${i + 1}张未找到高清图地址,已跳过(${status})`);
                            continue;
                        }
                        await this.save(src, i);
                        await this.sleep(3000); // throttle to avoid being banned
                    } catch (e) {
                        console.log('errorheyu:', e);
                    }
                }
            } finally {
                console.log('跳出下载');
                await instance.exit();
            }
        }

    }
    
    // Start the crawl. The constructor shown above declares no parameters, so
    // the 'xxx_url' argument is currently ignored.
    const thief = new stealData('xxx_url');
    // init() returns a promise; attach a rejection handler so that a failure
    // outside its internal try/catch is reported instead of becoming an
    // unhandled rejection.
    thief.init().catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });

    参考:1、分分钟教你用node写个爬虫

       2、Node.js爬虫实战,爬去图片到本地

       3、爬取瀑布流网页高清图

       4、PhantomJs的用法

       5、NodeJs优秀工具之——nightmare

       6、腾讯云上Phantom用例

     

  • 相关阅读:
    122. 买卖股票的最佳时机 II
    45. 跳跃游戏 II
    134. 加油站
    55. 跳跃游戏
    714. 买卖股票的最佳时机含手续费
    121. 买卖股票的最佳时机
    860. 柠檬水找零
    开发环境安装合集(部分搬运)
    javascript的单例模式
    javascript 工厂模式
  • 原文地址:https://www.cnblogs.com/hy96/p/13606708.html
Copyright © 2011-2022 走看看