zoukankan      html  css  js  c++  java
  • Node.js 爬取百度图片瀑布流,使用 class 封装。

    //爬取百度高清图片
    const phantom = require('phantom')
    const express = require('express');
    const app = express();
    const fs= require('fs');
    const cheerio = require('cheerio');
    const request = require('request')
    // Start the HTTP server on port 2000 and, once it is listening, print the
    // address and port it is bound to.
    const server = app.listen(2000, () => {
        const { address: host, port } = server.address();
        console.log('Your App is running at http://%s:%s', host, port);
    });
    
    /**
     * Crawler for a Baidu image-search result page.
     *
     * Flow (see init): scroll the waterfall page in PhantomJS until enough
     * thumbnails are present, collect the detail-page link of every thumbnail,
     * then open each detail page and download the image found in `#currentImg`
     * into ./image.
     */
    class stealData {

        /**
         * Derives a file extension from an image URL, ignoring any query
         * string or fragment.
         *
         * @param {string} url - Image URL (absolute, protocol-relative or relative).
         * @returns {string} Lower-case extension without the dot, or 'jpg'
         *     when the URL path carries no usable extension.
         */
        static imageExt(url) {
            let pathname;
            try {
                pathname = new URL(String(url), 'https://image.baidu.com').pathname;
            } catch (e) {
                return 'jpg';
            }
            const file = pathname.slice(pathname.lastIndexOf('/') + 1);
            const dot = file.lastIndexOf('.');
            const ext = dot === -1 ? '' : file.slice(dot + 1).toLowerCase();
            // Anything exotic (too long, odd characters) would yield a broken file name.
            return /^[a-z0-9]{1,5}$/.test(ext) ? ext : 'jpg';
        }

        /**
         * Resolves a thumbnail's href against the Baidu image host.
         *
         * @param {string|undefined} href - Raw `href` attribute of the thumbnail link.
         * @returns {string|null} Absolute http(s) URL, or null when the href is
         *     missing, unparsable or not an http(s) link (e.g. `javascript:`).
         */
        static detailUrl(href) {
            if (!href) {
                return null;
            }
            try {
                const target = new URL(href, 'https://image.baidu.com');
                return /^https?:$/.test(target.protocol) ? target.href : null;
            } catch (e) {
                return null;
            }
        }

        constructor() {
            // Search-result page to crawl.
            this.base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&oq=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&rsp=-1';

            this.current_page = 1;
            // Filled by getLoadPictures: [{ down_loda_url: string }, ...].
            this.result_list = [];
            // Filled by pageScroll: cheerio collection of the `.imgitem` elements.
            this.a = '';
        }

        /**
         * Runs the whole crawl. Every step owns (and exits) its own PhantomJS
         * process, so nothing has to be cleaned up here. Errors are logged,
         * never rethrown.
         *
         * @returns {Promise<void>}
         */
        async init() {
            try {
                await this.openNet();          // load and scroll the result page
                await this.getLoadPictures();  // collect detail-page links
                await this.imgSave(0);         // download the full-size images
            } catch (e) {
                console.log(e);
            }
        }

        /**
         * Pause between requests so the crawler is less likely to get its IP blocked.
         *
         * @param {number} time - Pause length in milliseconds.
         * @returns {Promise<void>}
         */
        sleep(time) {
            return new Promise((resolve) => {
                console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`)
                setTimeout(() => {
                    resolve();
                }, time);
            });
        }

        /**
         * Plain delay.
         *
         * @param {number} second - Delay length in seconds.
         * @returns {Promise<void>}
         */
        delay(second) {
            return new Promise((resolve) => {
                setTimeout(resolve, second * 1000);
            });
        }

        /**
         * Opens the result page and scrolls it from the top.
         *
         * @returns {Promise<void>}
         */
        async openNet() {
            await this.pageScroll(0);
        }

        /**
         * Opens `base_url` once and scrolls down in 1000px steps until the page
         * holds at least 20 `.imgbox` elements, then stores the `.imgitem`
         * collection of that final DOM in `this.a`.
         *
         * A single page is reused for every step, so `this.a` always reflects
         * the deepest scroll position, and the PhantomJS process is exited even
         * when a step throws.
         *
         * @param {number} i - Scroll step to start from (offset is 1000 * i px).
         * @param {number} [maxScrolls=30] - Upper bound on additional scroll
         *     steps, so a page that never reaches 20 images cannot loop forever.
         * @returns {Promise<void>}
         * @throws {Error} When the page cannot be opened.
         */
        async pageScroll(i, maxScrolls = 30) {
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                // Set the browser window size before loading so the layout uses it.
                await page.property('viewportSize', {
                    width: 1920,
                    height: 1080
                });
                const status = await page.open(this.base_url);
                if (status !== 'success') {
                    throw new Error(`failed to open ${this.base_url}: ${status}`);
                }
                await this.delay(5);

                const lastStep = i + Math.max(0, maxScrolls);
                let $ = null;
                for (let step = i; step <= lastStep; step++) {
                    await page.property('scrollPosition', {
                        left: 0,
                        top: 1000 * step
                    });
                    // Give the newly revealed thumbnails a moment to be inserted.
                    await this.delay(1);
                    $ = cheerio.load(await page.property('content'));
                    const count = $('.imgbox').length;
                    console.log(count);
                    if (count >= 20) {
                        break;
                    }
                }
                // All elements that wrap a thumbnail end up here.
                this.a = $('.imgitem');
            } finally {
                await instance.exit();
            }
        }

        /**
         * Appends the detail-page URL of every collected thumbnail to
         * `this.result_list`. Thumbnails without a usable link are skipped.
         *
         * @param {*} [a] - Unused; kept so existing callers keep working.
         * @returns {Promise<void>}
         */
        async getLoadPictures(a) {
            // `this.a` is still '' when pageScroll has not run (or failed).
            if (!this.a || typeof this.a.each !== 'function') {
                return;
            }
            this.a.each((index) => {
                const href = this.a.eq(index).find('a').attr('href');
                const url = stealData.detailUrl(href);
                if (url !== null) {
                    this.result_list.push({ down_loda_url: url });
                }
            });
        }

        /**
         * Opens every detail page from index `i` onwards, reads the full-size
         * image URL from `#currentImg` and downloads it. A failing download is
         * logged and does not stop the remaining ones.
         *
         * @param {number} [i=0] - Index in `result_list` to start from.
         * @returns {Promise<void>}
         */
        async imgSave(i = 0) {
            if (i >= this.result_list.length) {
                return; // nothing (left) to download
            }
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                for (let index = i; index < this.result_list.length; index++) {
                    const status = await page.open(this.result_list[index].down_loda_url);
                    await this.delay(2);
                    const $ = cheerio.load(await page.property('content'));
                    const src = $('#currentImg').attr('src');
                    console.log('statue', status);
                    console.log('src', src);
                    if (!src) {
                        continue; // detail page did not expose an image
                    }
                    try {
                        await this.save(src, index);
                    } catch (e) {
                        console.log(e);
                    }
                }
            } finally {
                // Leave the PhantomJS environment once all downloads are done.
                await instance.exit();
            }
        }

        /**
         * Downloads one image into ./image (created on demand) under a
         * timestamp-based file name.
         *
         * @param {string} url - Image URL.
         * @param {number} i - Zero-based index of the image, used for logging.
         * @returns {Promise<void>} Resolves once the file is fully written;
         *     rejects on a request or file-system error.
         */
        save(url, i) {
            console.log(`开始写入第${i + 1}张`)
            return new Promise((resolve, reject) => {
                fs.mkdirSync('./image', { recursive: true });
                const file = fs.createWriteStream(`./image/${new Date().getTime()}.${stealData.imageExt(url)}`);
                const fail = (err) => {
                    file.destroy();
                    reject(err);
                };
                file.on('error', fail);
                file.on('finish', () => {
                    console.log(`写入成功`)
                    resolve();
                });
                try {
                    request(url).on('error', fail).pipe(file);
                } catch (err) {
                    fail(err);
                }
            });
        }

    }
    
    // Start the crawl. The constructor takes no parameters, so no argument is
    // passed; the catch logs any rejection that escapes init() instead of
    // leaving it unhandled.
    const thief = new stealData();
    thief.init().catch((e) => console.log(e));

    做了一些优化:改用 for 循环依次打开每张缩略图对应的大图页面并下载,同时在每次下载之后加入延时,防止 IP 被封。

    //爬取百度高清图片
    const phantom = require('phantom')
    const express = require('express');
    const app = express();
    const fs= require('fs');
    const cheerio = require('cheerio');
    const request = require('request')
    // Start the HTTP server on port 2000 and, once it is listening, print the
    // address and port it is bound to.
    const server = app.listen(2000, () => {
        const { address: host, port } = server.address();
        console.log('Your App is running at http://%s:%s', host, port);
    });
    
    /**
     * Crawler for a Baidu image-search result page.
     *
     * Flow (see init): scroll the waterfall page in PhantomJS until enough
     * thumbnails are present, collect the detail-page link of every thumbnail,
     * then open each detail page and download the image found in `#currentImg`
     * into ./Marvel, pausing between downloads.
     */
    class stealData {

        /**
         * Derives a file extension from an image URL, ignoring any query
         * string or fragment.
         *
         * @param {string} url - Image URL (absolute, protocol-relative or relative).
         * @returns {string} Lower-case extension without the dot, or 'jpg'
         *     when the URL path carries no usable extension.
         */
        static imageExt(url) {
            let pathname;
            try {
                pathname = new URL(String(url), 'https://image.baidu.com').pathname;
            } catch (e) {
                return 'jpg';
            }
            const file = pathname.slice(pathname.lastIndexOf('/') + 1);
            const dot = file.lastIndexOf('.');
            const ext = dot === -1 ? '' : file.slice(dot + 1).toLowerCase();
            // Anything exotic (too long, odd characters) would yield a broken file name.
            return /^[a-z0-9]{1,5}$/.test(ext) ? ext : 'jpg';
        }

        /**
         * Resolves a thumbnail's href against the Baidu image host.
         *
         * @param {string|undefined} href - Raw `href` attribute of the thumbnail link.
         * @returns {string|null} Absolute http(s) URL, or null when the href is
         *     missing, unparsable or not an http(s) link (e.g. `javascript:`).
         */
        static detailUrl(href) {
            if (!href) {
                return null;
            }
            try {
                const target = new URL(href, 'https://image.baidu.com');
                return /^https?:$/.test(target.protocol) ? target.href : null;
            } catch (e) {
                return null;
            }
        }

        constructor() {
            // Search-result page to crawl.
            this.base_url = 'https://image.baidu.com/search/index?ct=201326592&z=&tn=baiduimage&word=%E6%BC%AB%E5%A8%81%E5%9B%BE%E7%89%87&pn=0&ie=utf-8&oe=utf-8&cl=2&lm=-1&fr=ala&se=&sme=&width=1920&height=1080'
            this.current_page = 1;
            // Filled by getLoadPictures: [{ down_loda_url: string }, ...].
            this.result_list = [];
            // Filled by pageScroll: cheerio collection of the `.imgitem` elements.
            this.a = '';
            this.urllist = []
        }

        /**
         * Runs the whole crawl. Every step owns (and exits) its own PhantomJS
         * process, so nothing has to be cleaned up here. Errors are logged,
         * never rethrown.
         *
         * @returns {Promise<void>}
         */
        async init() {
            try {
                await this.openNet();          // load and scroll the result page
                await this.getLoadPictures();  // collect detail-page links
                await this.getrealPictures();  // resolve and download the full-size images
            } catch (e) {
                console.log(e);
            }
        }

        /**
         * Pause between requests so the crawler is less likely to get its IP blocked.
         *
         * @param {number} time - Pause length in milliseconds.
         * @returns {Promise<void>}
         */
        sleep(time) {
            return new Promise((resolve) => {
                console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`)
                setTimeout(() => {
                    resolve();
                }, time);
            });
        }

        /**
         * Plain delay.
         *
         * @param {number} second - Delay length in seconds.
         * @returns {Promise<void>}
         */
        delay(second) {
            return new Promise((resolve) => {
                setTimeout(resolve, second * 1000);
            });
        }

        /**
         * Opens the result page and scrolls it from the top.
         *
         * @returns {Promise<void>}
         */
        async openNet() {
            await this.pageScroll(0);
        }

        /**
         * Opens `base_url` once and scrolls down in 1000px steps until the page
         * holds at least 20 `.imgbox` elements, then stores the `.imgitem`
         * collection of that final DOM in `this.a`.
         *
         * A single page is reused for every step, so `this.a` always reflects
         * the deepest scroll position, and the PhantomJS process is exited even
         * when a step throws.
         *
         * @param {number} i - Scroll step to start from (offset is 1000 * i px).
         * @param {number} [maxScrolls=30] - Upper bound on additional scroll
         *     steps, so a page that never reaches 20 images cannot loop forever.
         * @returns {Promise<void>}
         * @throws {Error} When the page cannot be opened.
         */
        async pageScroll(i, maxScrolls = 30) {
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                // Set the browser window size before loading so the layout uses it.
                await page.property('viewportSize', {
                    width: 1920,
                    height: 1080
                });
                const status = await page.open(this.base_url);
                if (status !== 'success') {
                    throw new Error(`failed to open ${this.base_url}: ${status}`);
                }
                await this.delay(5);

                const lastStep = i + Math.max(0, maxScrolls);
                let $ = null;
                for (let step = i; step <= lastStep; step++) {
                    await page.property('scrollPosition', {
                        left: 0,
                        top: 1000 * step
                    });
                    // Give the newly revealed thumbnails a moment to be inserted.
                    await this.delay(1);
                    $ = cheerio.load(await page.property('content'));
                    const count = $('.imgbox').length;
                    console.log(count);
                    if (count >= 20) {
                        break;
                    }
                }
                // All elements that wrap a thumbnail end up here.
                this.a = $('.imgitem');
            } finally {
                await instance.exit();
            }
        }

        /**
         * Appends the detail-page URL of every collected thumbnail to
         * `this.result_list`. Thumbnails without a usable link are skipped.
         *
         * @param {*} [a] - Unused; kept so existing callers keep working.
         * @returns {Promise<void>}
         */
        async getLoadPictures(a) {
            // `this.a` is still '' when pageScroll has not run (or failed).
            if (!this.a || typeof this.a.each !== 'function') {
                return;
            }
            this.a.each((index) => {
                const href = this.a.eq(index).find('a').attr('href');
                const url = stealData.detailUrl(href);
                if (url !== null) {
                    this.result_list.push({ down_loda_url: url });
                }
            });
        }

        /**
         * Opens the detail page of every entry in `result_list`, reads the
         * full-size image URL from `#currentImg` and downloads it, pausing
         * after each download. A failing entry is logged and skipped; the
         * PhantomJS process is always exited at the end.
         *
         * @returns {Promise<void>}
         */
        async getrealPictures() {
            if (this.result_list.length === 0) {
                return; // nothing to download
            }
            const instance = await phantom.create();
            try {
                const page = await instance.createPage();
                for (let i = 0; i < this.result_list.length; i++) {
                    try {
                        const detail = this.result_list[i].down_loda_url;
                        await page.open(detail);
                        await this.delay(2);
                        // Read the DOM only after the detail page has been opened.
                        const $ = cheerio.load(await page.property('content'));
                        const src = $('#currentImg').attr('src');
                        console.log('src', src)
                        if (!src) {
                            throw new Error(`no #currentImg src found on ${detail}`);
                        }
                        await this.save(src, i);
                        await this.sleep(3000); // throttle to avoid being blocked
                    } catch (e) {
                        console.log('errorheyu:', e)
                    }
                }
                console.log('跳出下载')
            } finally {
                await instance.exit();
            }
        }

        /**
         * Downloads one image into ./Marvel (created on demand) under a
         * timestamp-based file name.
         *
         * @param {string} url - Image URL.
         * @param {number} i - Zero-based index of the image, used for logging.
         * @returns {Promise<void>} Resolves once the file is fully written;
         *     rejects on a request or file-system error.
         */
        save(url, i) {
            console.log(`开始写入第${i + 1}张`)
            return new Promise((resolve, reject) => {
                fs.mkdirSync('./Marvel', { recursive: true });
                const file = fs.createWriteStream(`./Marvel/${new Date().getTime()}.${stealData.imageExt(url)}`);
                const fail = (err) => {
                    file.destroy();
                    reject(err);
                };
                file.on('error', fail);
                file.on('finish', () => {
                    console.log(`写入成功`)
                    resolve();
                });
                try {
                    request(url).on('error', fail).pipe(file);
                } catch (err) {
                    fail(err);
                }
            });
        }

    }
    
    // Start the crawl. The constructor takes no parameters, so no argument is
    // passed; the catch logs any rejection that escapes init() instead of
    // leaving it unhandled.
    const thief = new stealData();
    thief.init().catch((e) => console.log(e));

    参考:1、分分钟教你用node写个爬虫

       2、Node.js爬虫实战,爬去图片到本地

       3、爬取瀑布流网页高清图

       4、PhantomJs的用法

       5、NodeJs优秀工具之——nightmare

       6、腾讯云上Phantom用例

     

  • 相关阅读:
    Educational Codeforces Round 88 (Rated for Div. 2) D. Yet Another Yet Another Task(枚举/最大连续子序列)
    Educational Codeforces Round 88 (Rated for Div. 2) A. Berland Poker(数学)
    Educational Codeforces Round 88 (Rated for Div. 2) E. Modular Stability(数论)
    Educational Codeforces Round 88 (Rated for Div. 2) C. Mixing Water(数学/二分)
    Codeforces Round #644 (Div. 3)
    Educational Codeforces Round 76 (Rated for Div. 2)
    Educational Codeforces Round 77 (Rated for Div. 2)
    Educational Codeforces Round 87 (Rated for Div. 2)
    AtCoder Beginner Contest 168
    Codeforces Round #643 (Div. 2)
  • 原文地址:https://www.cnblogs.com/hy96/p/13606708.html
Copyright © 2011-2022 走看看