zoukankan      html  css  js  c++  java
  • 使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹

    源代码如下:

     
    //(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' makes TLS connections and HTTPS requests insecure by disabling certificate verification.
    //解决 javascript – Node.js请求CERT_HAS_EXPIRED问题,下面这句置首
    // process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
    //end
    
    let http = require("http");
    let https = require("https");
    let iconv = require("iconv-lite");
    let cheerio = require("cheerio");
    let path = require('path');
    let fs = require('fs');
    const phantom = require('phantom');
    
    let EventEmitter = require('events').EventEmitter;
    
    /**
     * Named EventEmitter subclass; it adds no members of its own.
     */
    class MyEmitter extends EventEmitter {}

    // Shared event bus used by the spiders and helper functions in this file.
    // A limit of 0 switches off Node's "possible memory leak" listener warning.
    const myEmitter = new MyEmitter().setMaxListeners(0);
    
    // const util = require('util');
    
    const request = require('request');
    //var url = "https://www.baidu.com/";
    //const getPromise = util.promisify(request.get);
    // Pool of browser User-Agent strings; the request helpers in this file pick
    // one at random for each outgoing request.
    // NOTE(review): several entries appear more than once, so those strings are
    // chosen more often than the others.
    const userAgents = [
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    ];
    
    // Selector templates, tried in array order when looking for a page's images.
    // "normal" matches an <img> wrapped in a link; "fix" matches the same spot
    // when the <img> is not wrapped in an <a>.
    let selector_temple = [
        {
            "normal": "#picBody > p > a > img",
            "fix": "#picBody > p >  img"
        },
        {
            "normal": "#picBody > center > a > img",
            "fix": "#picBody > center > img"
        },
        {
            "normal": "#contentV3_article > div.contentV3_body > p > a > img",
            "fix": "#contentV3_article > div.contentV3_body > p > img"
        }
    ];
    
    /**
     * Promise-based delay.
     * @param {number} time how long to wait, in milliseconds (defaults to 0)
     * @returns {Promise<undefined>} resolves with no value once the timer fires
     */
    function sleep(time = 0) {
        return new Promise((done) => {
            setTimeout(() => done(), time);
        });
    }
    
    
    /**
     * Crawls a paginated 2717.com gallery and stores every page's image as
     * imgs/<type>/<url path>/<image title>/<page number><ext>.
     *
     * The steps are chained through the shared `myEmitter` bus:
     *   "html"   -> getTupianData  (extract <img> src/alt from the page)
     *   "images" -> downloadphoto  (start one download per extracted image)
     *   this.download_onepage_event -> one event per finished image; when all
     *       images of the page have reported, the next page is fetched or
     *       "download_allpage_event" is emitted after the last page.
     */
    class Spider2717 extends EventEmitter {

        /**
         * @param {string} _starturl URL of the gallery's first page (expected to end in ".html")
         * @param {string} _type     category name, used as a sub directory of imgs/
         * @param {number} _nextpage first page number to crawl
         * @param {number} _lastpage last page number to crawl (inclusive)
         */
        constructor(_starturl = 'https://www.2717.com/ent/meinvtupian/2019/316305.html',
                    _type = 'meinv',
                    _nextpage = 1,
                    _lastpage = 1
        ) {
            super();

            // {src, title} records of the page currently being processed.
            this.data = [];
            this.starturl = _starturl;
            this.nextpage = _nextpage;
            this.lastpage = _lastpage;
            this.type = _type;

            // The save directory mirrors the URL path between the site root and the file name.
            const site = "https://www.2717.com/";
            const i1 = this.starturl.lastIndexOf(site) + site.length;
            const i2 = this.starturl.lastIndexOf("/");
            const tmpstr = this.starturl.substring(i1, i2);

            this.savedir = path.join('imgs', this.type, tmpstr).toString();
            console.log("savedir:" + this.savedir);
            // recursive: also creates missing parents and does not fail if the directory exists.
            fs.mkdirSync(this.savedir, {recursive: true});

            // false after the most recent page request failed.
            this.get_html_flag = true;
            // Number of images of the current page that have reported completion.
            this.downloaded_imagepage_count = 0;
            // Event emitted once per finished (ok / fail / skipped) image.
            this.download_onepage_event = "download_onepage_event";
        }

        /**
         * Turns an image title into a usable directory name by replacing every
         * \ / < > | with "_". A missing title becomes the empty string.
         * @param {string|undefined} title
         * @returns {string}
         */
        static #sanitize_title(title) {
            return String(title ?? '').replace(/[\\/<>|]/g, '_');
        }

        /**
         * Fetches `url`, decodes the body as GBK and emits
         * `event_name` (html, pno) on the shared bus. On any failure the event is
         * still emitted, with an empty string as html, so the crawl can continue.
         * @param {string} url
         * @param {number} pno        page number, passed through to the event
         * @param {string} event_name event to emit, "html" by default
         */
        spidermeinvtupian(url, pno, event_name = 'html') {
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            request({
                url: url,
                // request only sends headers given under `headers`.
                headers: {'User-Agent': userAgent},
                timeout: 5000,
                encoding: null // keep the body as a Buffer so it can be decoded manually
            }, (error, response, body) => {
                // Arrow function: `this` must be the spider so get_html_flag is really updated.
                if (!error && response.statusCode === 200) {
                    const html = iconv.decode(body, 'gbk').toString();
                    this.get_html_flag = true;
                    myEmitter.emit(event_name, html, pno);
                    return;
                }
                // `error` is null when the server answered with a non-200 status.
                const reason = error ? error.message : "HTTP " + response.statusCode;
                console.log("获取 " + url + " 失败!--" + reason);

                this.get_html_flag = false;
                myEmitter.emit(event_name, '', pno);
            });
        }

        /**
         * Extracts src/alt of the page's images, appends them to this.data and
         * emits `event_name` (this.data, pno).
         * @param {string} html
         * @param {number} pno
         * @param {string} event_name event to emit, "images" by default
         */
        getTupianData(html, pno, event_name = 'images') {
            const $ = cheerio.load(html);

            // Try every template ("normal" first, then "fix") until one matches.
            let imgs = [];
            for (const template of selector_temple) {
                for (const key of ['normal', 'fix']) {
                    imgs = $(template[key]).toArray();
                    console.log("selector:" + template[key]);
                    if (imgs.length > 0) break;
                }
                if (imgs.length > 0) break;
            }
            console.log("total page1:" + imgs.length);

            for (const img of imgs) {
                const src = $(img).attr('src');
                const title = $(img).attr("alt");
                this.data.push({src, title});
            }
            myEmitter.emit(event_name, this.data, pno);
        }

        /**
         * Starts a download for every record in `data`. Titles are sanitised in
         * place because they become directory names.
         * @param {Array<{src: string, title: string}>} data
         * @param {number} pno page number the images belong to
         */
        downloadphoto(data, pno) {
            if (data.length === 0) {
                // No image means no per-image completion event would ever fire and
                // the crawl would hang on this page; report the page as failed instead.
                console.log("第" + pno + "页未解析到图片,跳过该页");
                myEmitter.emit(this.download_onepage_event, "fail", pno);
                return;
            }
            for (let i = 0; i < data.length; i++) {
                data[i].title = Spider2717.#sanitize_title(data[i].title);
                this.downloadfile(data[i].src, data[i].title, i, pno);
            }
        }

        /**
         * Downloads `src` into `filename` after a random delay of up to
         * `delaytime` ms. Exactly one this.download_onepage_event is emitted per
         * call chain: "ok" once the file is fully written, "fail" otherwise.
         * @param {string} src
         * @param {number} no        index of the image on the page, passed to the event
         * @param {string} filename  target file
         * @param {number} [delaytime] upper bound of the random start delay in ms
         * @param {number} [redirects_left=5] how many 301/302 hops may still be followed
         */
        calldownload = (src, no, filename, delaytime, redirects_left = 5) => {
            if (src == null || src.length === 0) {
                console.log(`下载图片src':${src} '非法,跳过下载,继续下一个`);
                myEmitter.emit(this.download_onepage_event, "fail", no);
                return;
            }
            // A missing delaytime counts as "no delay" instead of producing NaN.
            const time = Math.random() * (delaytime || 0);
            setTimeout(() => this.#fetch_to_file(src, no, filename, delaytime, redirects_left), time);
        };

        /**
         * Performs the actual GET for calldownload and streams a 200 response to disk.
         */
        #fetch_to_file(src, no, filename, delaytime, redirects_left) {
            // Several stream events can signal the same outcome; report only the first.
            let reported = false;
            const report = (status) => {
                if (reported) return;
                reported = true;
                myEmitter.emit(this.download_onepage_event, status, no);
            };

            let client;
            if (src.startsWith("https")) {
                client = https;
            } else if (src.startsWith("http")) {
                client = http;
            } else {
                console.log("下载图片src:" + src + " 不是http(s)地址,跳过下载");
                report("fail");
                return;
            }
            const scheme = client === https ? "https" : "http";

            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            const options = {
                headers: {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Accept-Charset': 'UTF-8;',
                    'User-Agent': userAgent
                }
            };

            const req = client.get(src, options, (res) => {
                const code = res.statusCode;

                if (code === 200) {
                    // Only open the target file for a real image response.
                    const writer = fs.createWriteStream(filename);
                    const abort = (err) => {
                        console.log("download_onepage_event:failed" + (err ? err.message : ''));
                        res.destroy();
                        writer.destroy();
                        // Best effort: a partial file larger than 1 KB would otherwise
                        // be mistaken for a finished download on the next run.
                        fs.unlink(filename, () => {});
                        report("fail");
                    };
                    writer.on('finish', () => {
                        console.log(new Date().toLocaleString() + ",完成下载:" + filename);
                        report("ok");
                    });
                    writer.on('error', abort);
                    res.on('error', abort);
                    // Connection dropped before the whole body arrived.
                    res.on('close', () => {
                        if (!res.complete && !reported) abort();
                    });
                    res.pipe(writer);
                    return;
                }

                res.resume(); // discard the body so the socket is released

                if (code === 301 || code === 302) {
                    console.log("未完成下载:" + filename + "," + scheme + "返回值:" + code);
                    const location = res.headers.location;
                    console.log("正在重新跳转正确的URL进行下载:" + location);
                    if (!location || redirects_left <= 0) {
                        report("fail");
                        return;
                    }
                    let target;
                    try {
                        // Location may be relative to the current URL.
                        target = new URL(location, src).toString();
                    } catch (err) {
                        report("fail");
                        return;
                    }
                    reported = true; // the follow-up call reports the final outcome
                    this.calldownload(target, no, filename, delaytime, redirects_left - 1);
                    return;
                }

                console.log("下载:" + filename + " 失败," + scheme + "返回值:" + code);
                report("fail");
            });

            // Without this handler a DNS/connection error would crash the process.
            req.on('error', (err) => {
                console.log("download_onepage_event:failed" + err.message);
                report("fail");
            });
        }

        /**
         * Downloads one image to <savedir>/<title>/<pno><ext>, skipping files that
         * already exist locally with more than 1 KB.
         * @param {string} src   image URL
         * @param {string} title sanitised image title, used as directory name
         * @param {number} no    index of the image on the page
         * @param {number} pno   page number, used as file name
         */
        downloadfile = (src, title, no, pno) => {
            try {
                console.log("src:" + src);

                const dirpath = path.join(this.savedir, title).toString();
                fs.mkdirSync(dirpath, {recursive: true});

                const filename = path.join(dirpath, pno + path.extname(src));
                if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                    console.log("本地文件:" + filename + "已经存在,系统跳过下载");
                    myEmitter.emit(this.download_onepage_event, "ingore", no);
                    return;
                }
                console.log(new Date().toLocaleDateString() + ",正在下载:" + filename);
                this.calldownload(src, no, filename, 100);
            } catch (e) {
                // e.g. src is not a string, or the directory cannot be created.
                console.log(e);
                myEmitter.emit(this.download_onepage_event, "ingore", no);
            }
        };

        /**
         * Registers the bus listeners and starts crawling at this.nextpage.
         * After the last page: emits "download_allpage_event" ("ok"), removes the
         * listeners and writes "ok" to save.txt.
         */
        startSpider = () => {
            myEmitter.on("html", (html, pno) => {
                this.getTupianData(html, pno);
            });
            myEmitter.on("images", (data, pno) => {
                this.downloadphoto(data, pno);
            });

            this.downloaded_imagepage_count = 0;
            this.data = [];

            const on_image_done = (status, pno) => {
                console.log("download_onepage_event=>status:" + status);
                this.downloaded_imagepage_count++;
                if (this.downloaded_imagepage_count < this.data.length) {
                    return; // images of this page are still in flight
                }

                console.log("某单页图片数据抓取完毕!");
                this.downloaded_imagepage_count = 0;
                this.data = [];

                this.nextpage++;
                if (this.nextpage <= this.lastpage) {
                    console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
                    this.spiderpage(this.nextpage);
                    return;
                }

                console.log("所有页面图片数据抓取完毕!");
                myEmitter.emit("download_allpage_event", "ok");
                this.data = [];
                myEmitter.removeAllListeners("html");
                myEmitter.removeAllListeners("images");
                // Also detach this handler; left behind on the shared bus it would
                // keep reacting to image events of spiders started later.
                myEmitter.removeListener(this.download_onepage_event, on_image_done);
                // Completion marker file.
                fs.writeFileSync('save.txt', "ok");
            };
            myEmitter.on(this.download_onepage_event, on_image_done);

            console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
            this.spiderpage(this.nextpage);
        };

        /**
         * Fetches page `pageno`. Page 1 is the start URL itself; page N is the
         * start URL with "_N" inserted before its 5-character ".html" suffix.
         * @param {number} pageno
         */
        spiderpage = (pageno) => {
            let url = this.starturl;
            if (pageno !== 1) {
                url = this.starturl.substring(0, this.starturl.length - 5) + "_" + pageno + ".html";
            }

            console.log("url:" + url);
            this.spidermeinvtupian(url, pageno);
        };
    }
    
    /**
     * Fetches `url` with the `request` module, decodes the body as GBK and emits
     * `event_name` (html) on the shared bus. On any failure (network error or
     * non-200 status) the event is emitted with an empty string instead.
     * @param {string} url
     * @param {string} event_name event to emit, "get_html" by default
     */
    function get_html_by_request(url, event_name = 'get_html') {
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        request({
            url: url,
            // request only sends headers given under `headers`.
            headers: {'User-Agent': userAgent},
            encoding: null, // keep the body as a Buffer so it can be decoded manually
            strictSSL: true
        }, function (error, response, body) {
            if (!error && response.statusCode === 200) {
                const html = iconv.decode(body, 'gbk').toString();
                myEmitter.emit(event_name, html);
                return;
            }
            // On a network error `response` is undefined, so only read it when there is no error.
            const reason = error ? error.message : response.statusCode;
            console.log("获取 " + url + " 失败:" + reason);
            myEmitter.emit(event_name, '');
        });
    }
    
    
    /**
     * Loads `url` in a PhantomJS page (random User-Agent, JavaScript and images
     * enabled) and returns the page's `content` property.
     * The PhantomJS instance is always shut down, also when loading fails.
     * @param {string} url
     * @returns {Promise<string|*>} whatever page.property('content') yields
     */
    async function get_html_from_url_by_phantom(url) {
        const instance = await phantom.create();
        try {
            const page = await instance.createPage();
            await page.on('onResourceRequested', function (requestData) {
                console.info('Requesting', requestData.url);
            });

            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            // page.settings = ... is not supported by the phantom module; settings
            // have to be written through page.property.
            await page.property('settings', {
                javascriptEnabled: true,
                loadImages: true,
                userAgent: userAgent
            });

            const status = await page.open(url);
            if (status !== 'success') {
                console.log("phantom 打开 " + url + " 失败,状态:" + status);
            }

            return await page.property('content');
        } finally {
            // Without this an exception above would leave the phantomjs process running.
            await instance.exit();
        }
    }
    
    /**
     * Reads the gallery title and the total page count out of a page's html.
     * @param {string} html
     * @returns {{title: string, count: number}} `title` is the text of the first
     *     matching <h1>; `count` is the numeric `pageinfo` attribute of
     *     #pageinfo, or -1 when the element is missing or the attribute is not
     *     a finite number.
     */
    function getPageinfo(html) {
        const $ = cheerio.load(html);

        const hs = $('div.warp.mar.oh > div.warp.oh > h1').toArray();
        const title = $(hs[0]).text();

        let count = -1;
        const lis = $('#pageinfo').toArray();
        if (lis.length > 0) {
            // A missing or garbled attribute would turn into NaN, which defeats
            // callers' "count <= 0" checks; fall back to the -1 sentinel instead.
            const parsed = Number($(lis[0]).attr('pageinfo'));
            if (Number.isFinite(parsed)) {
                count = parsed;
            }
        }

        return {'title': title, 'count': count};
    }
    
    //---------------------------------------------------------------------------
    /**
     * Downloads all images of a single, non-paginated page whose html is already
     * in memory. Files are stored as <save dir>/<1-based image index><ext>.
     *
     * Every finished image (downloaded, failed or skipped) is announced on the
     * shared `myEmitter` bus as 'main_download_one_image' (status, index); once
     * all images have reported, "download_allpage_event" is emitted.
     */
    class SpiderOnePageBuff {
        /**
         * @param {string} _html       html of the page to scan
         * @param {string} _event_name stored as this.event_name; only used in log output here
         * @param {string} _save_dir   directory the images are written to (created if missing)
         */
        constructor(_html, _event_name, _save_dir) {
            this.savedir = _save_dir;
            // recursive: also creates missing parents and does not fail if the directory exists.
            fs.mkdirSync(this.savedir, {recursive: true});

            // Prefix for this class's log lines.
            this.clsname = 'SpiderOnePageBuff=>';
            // Event name historically emitted after each download; still emitted.
            this.downloaded_one_image = 'downloaded_one_image';

            this.html = _html;

            // {src, title} records, parallel to this.imgs.
            this.data = [];
            this.imgs = [];

            this.event_name = _event_name;
            // Number of images that have reported completion.
            this.process_event_finish_count = 0;
        }

        /**
         * Announces that image `no` is finished. The legacy event name is kept for
         * existing subscribers; 'main_download_one_image' is the event the
         * completion counter in start_spider listens on.
         */
        #report_image_done(status, no) {
            myEmitter.emit(this.downloaded_one_image, status, no);
            myEmitter.emit('main_download_one_image', status, no);
        }

        /**
         * Downloads `src` into `filename` after a random delay of up to
         * `delaytime` ms and reports the outcome exactly once per call chain:
         * "ok" once the file is fully written, "fail" otherwise.
         * @param {string} src
         * @param {number} no        index of the image in this.data
         * @param {string} filename  target file
         * @param {number} [delaytime] upper bound of the random start delay in ms
         * @param {number} [redirects_left=5] how many 301/302 hops may still be followed
         */
        calldownload = (src, no, filename, delaytime, redirects_left = 5) => {
            if (src == null || src.length === 0) {
                console.log(this.clsname + `下载图片src':${src} '非法,跳过下载,继续下一个`);
                if (this.data[no]) {
                    this.data[no].flag = true;
                }
                // Must still count as finished, otherwise completion is never reached.
                this.#report_image_done("fail", no);
                return;
            }
            // A missing delaytime counts as "no delay" instead of producing NaN.
            const time = Math.random() * (delaytime || 0);
            setTimeout(() => this.#fetch_to_file(src, no, filename, delaytime, redirects_left), time);
        };

        /**
         * Performs the actual GET for calldownload and streams a 200 response to disk.
         */
        #fetch_to_file(src, no, filename, delaytime, redirects_left) {
            // Several stream events can signal the same outcome; report only the first.
            let reported = false;
            const report = (status) => {
                if (reported) return;
                reported = true;
                this.#report_image_done(status, no);
            };

            let client;
            if (src.startsWith("https")) {
                client = https;
            } else if (src.startsWith("http")) {
                client = http;
            } else {
                console.log(this.clsname + "下载图片src:" + src + " 不是http(s)地址,跳过下载");
                report("fail");
                return;
            }
            const scheme = client === https ? "https" : "http";

            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            const options = {
                headers: {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Accept-Charset': 'UTF-8;',
                    'User-Agent': userAgent
                }
            };

            const req = client.get(src, options, (res) => {
                const code = res.statusCode;

                if (code === 200) {
                    // Only open the target file for a real image response.
                    const writer = fs.createWriteStream(filename);
                    const abort = (err) => {
                        console.log(this.clsname + "下载:" + filename + " 失败" + (err ? ":" + err.message : ''));
                        res.destroy();
                        writer.destroy();
                        // Best effort: a partial file larger than 1 KB would otherwise
                        // be mistaken for a finished download on the next run.
                        fs.unlink(filename, () => {});
                        report("fail");
                    };
                    writer.on('finish', () => {
                        console.log(this.clsname + new Date().toLocaleString() + ",完成下载:" + filename);
                        report("ok");
                    });
                    writer.on('error', abort);
                    res.on('error', abort);
                    // Connection dropped before the whole body arrived.
                    res.on('close', () => {
                        if (!res.complete && !reported) abort();
                    });
                    res.pipe(writer);
                    return;
                }

                res.resume(); // discard the body so the socket is released

                if (code === 301 || code === 302) {
                    console.log(this.clsname + "未完成下载:" + filename + "," + scheme + "返回值:" + code);
                    const location = res.headers.location;
                    console.log(this.clsname + "正在重新跳转正确的URL进行下载:" + location);
                    if (!location || redirects_left <= 0) {
                        report("fail");
                        return;
                    }
                    let target;
                    try {
                        // Location may be relative to the current URL.
                        target = new URL(location, src).toString();
                    } catch (err) {
                        report("fail");
                        return;
                    }
                    reported = true; // the follow-up call reports the final outcome
                    this.calldownload(target, no, filename, delaytime, redirects_left - 1);
                    return;
                }

                console.log(this.clsname + "下载:" + filename + " 失败," + scheme + "返回值:" + code);
                report("fail");
            });

            // Without this handler a DNS/connection error would crash the process.
            req.on('error', (err) => {
                console.log(this.clsname + "下载:" + filename + " 失败:" + err.message);
                report("fail");
            });
        }


        /**
         * Makes a title usable as a file name by replacing every \ / < > | with "_".
         * @param {string|undefined} _title
         * @returns {string} the cleaned title; empty string when no title was given
         */
        preprocess_title(_title) {
            return String(_title ?? '').replace(/[\\/<>|]/g, '_');
        }

        /**
         * Scans this.html for images, fills this.imgs / this.data and, when at
         * least one image was found, emits `event_name` (this.imgs).
         * @param {string} event_name event to emit, 'get_one_image' by default
         */
        spider_one_image = (event_name = 'get_one_image') => {
            const $ = cheerio.load(this.html);

            // Start from a clean slate so data always corresponds to imgs.
            this.imgs = [];
            this.data = [];

            // Try every template ("normal" first, then "fix") until one matches.
            for (const template of selector_temple) {
                for (const key of ['normal', 'fix']) {
                    this.imgs = $(template[key]).toArray();
                    console.log("selector:" + template[key]);
                    if (this.imgs.length > 0) break;
                }
                if (this.imgs.length > 0) break;
            }
            console.log("spider_one_image=>total page1:" + this.imgs.length);

            for (const img of this.imgs) {
                const src = $(img).attr('src');
                const title = this.preprocess_title($(img).attr("alt"));
                this.data.push({src, title});
            }

            if (this.imgs.length > 0) {
                myEmitter.emit(event_name, this.imgs);
            }
        };

        /**
         * Entry point: registers the bus listeners, scans the page and downloads
         * every image. "download_allpage_event" is emitted once all images have
         * reported ("ok"), or immediately with "fail" when the page has no images.
         */
        start_spider = () => {
            this.process_event_finish_count = 0;

            // Detach only this instance's handlers from the shared bus.
            const stop_listening = () => {
                myEmitter.removeListener('main_download_one_image', on_image_done);
                myEmitter.removeListener("get_one_image", on_images_found);
            };

            const on_image_done = (status, no) => {
                console.log(this.clsname + "status:" + status);
                console.log("this.event_name:" + this.event_name);
                this.process_event_finish_count++;
                if (this.process_event_finish_count >= this.imgs.length) {
                    // Every image has reported, whether it succeeded or not.
                    this.process_event_finish_count = 0;
                    this.data = [];
                    stop_listening();
                    myEmitter.emit("download_allpage_event", "ok");
                }
            };

            const on_images_found = () => {
                // Snapshot: on_image_done replaces this.data once everything is finished.
                const items = this.data;
                for (let i = 0; i < items.length; i++) {
                    const src = items[i].src;
                    if (src == null || src.length === 0) {
                        // calldownload logs the invalid src and reports "fail".
                        this.calldownload(src, i, '', 3000);
                        continue;
                    }
                    const filename = path.join(this.savedir, (i + 1) + path.extname(src));
                    console.log(this.clsname + new Date().toLocaleDateString() + ",正在下载:" + filename);
                    if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                        console.log(this.clsname + "本地文件:" + filename + "已经存在,系统跳过下载");
                        this.#report_image_done("ingore", i);
                        // Keep going: the remaining images still have to be handled.
                        continue;
                    }
                    this.calldownload(src, i, filename, 3000);
                }
            };

            myEmitter.on('main_download_one_image', on_image_done);
            myEmitter.on("get_one_image", on_images_found);

            this.spider_one_image();

            if (this.imgs.length === 0) {
                // Nothing will ever report completion; finish right away.
                stop_listening();
                myEmitter.emit("download_allpage_event", "fail");
            }
        };
    }
    
    
    //抓取页面入口url地址
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    // let url = 'https://www.2717.com/beautiful/zhuomianbeijing/2013/4499.html';
    
    /**
     * Crawl entry point: fetches the page at `url`, reads its page count and
     * title via getPageinfo(), then hands off to the matching spider.
     *
     * @param {string} [url] - URL of the first page of the album to crawl.
     * @param {string} [type] - Optional category. For a single-page album it
     *     becomes a sub-directory of `imgs`; for a multi-page album it is passed
     *     to Spider2717. When omitted (or null/undefined) the page title is used.
     */
    function main(url = 'https://www.2717.com/word/dongwushijie/2018/313620.html',type) {
        // Explicit check instead of `arguments.length`, so that
        // main(url, undefined) behaves exactly like main(url).
        const hasType = type !== undefined && type !== null;

        // Fired when every image of a page has been processed; currently a no-op hook.
        myEmitter.on("download_allpage_event",status=>{
        });

        myEmitter.on('get_html', html => {
            const data = getPageinfo(html);
            const pagecount = data['count'];
            const title = data['title'];
            console.log(title, pagecount);

            if (pagecount <= 0) {
                // No pagination: all images live on this single page.
                myEmitter.on("main_download_one_image", status => {
                    console.log("下载单个图片完成!!!=状态" + status);
                    // Write the completion marker file.
                    fs.writeFileSync('save.txt',"ok");
                });

                const savedir = hasType
                    ? path.join('imgs', type, title)
                    : path.join('imgs', title);

                const spiderbuff = new SpiderOnePageBuff(html, "main_download_one_image", savedir);
                spiderbuff.start_spider();
            } else {
                // The album is spread over several tab pages.
                const typestr = hasType ? type : title;
                const spider = new Spider2717(
                    url,
                    typestr,
                    1,
                    pagecount
                );
                spider.startSpider();
            }
        });

        // Trigger the fetch; its result arrives through the 'get_html' event.
        get_html_by_request(url);
    }
    
    /**
     * 主调用
     * 只需要指定抓取图片首页url
     */
    /*
    性感红唇美女暗黑哥特风高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html
    清新浪漫的蓝天白云纯美风景图片高清壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313774.html
    世外桃源田园山水风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313773.html
    祖国山河壮丽的自然风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313772.html
    上帝视角俯瞰不一样的自然美景图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313771.html
    小巧可爱的七星瓢虫动物图片壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2019/313769.html
    雨后如珠似玉的花卉水珠梦幻特写图壁纸片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313768.html
    神奇瑰丽的西藏圣象天门风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313767.html
    大自然雄伟雪山美景高清壁纸图片素材 https://www.2717.com/beautiful/zhuomianbeijing/2018/313723.html
    唯美图文手机背景高清壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313722.html
    甜美可爱的冬日圣诞女孩手机高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313721.html
    联想桌面壁纸高清图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313635.html
    香港乐坛天后容祖儿图片桌面壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313634.html
    刘德华主演电影高清桌面壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313608.html
    美女明星杨蓉白色吊带性感连衣裙高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313590.html
    死侍双刀耍酷高清壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313572.html
    马思纯露肩性感写真高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313571.html
    温馨幸福的韩系情侣高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313558.html
    韩国女神美女IU拼接图片大全分享 https://www.2717.com/beautiful/zhuomianbeijing/2018/313557.html
    你和我的倾城时光金瀚高清剧照图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313556.html
    李易峰高清手机壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313555.html
    最新超级可爱的萌娃拼接图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313552.html
    偶像练习生陈立农高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313532.html
    白敬亭帅气时尚高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313531.html
    悲伤逆流成河顾森湘高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313517.html
    可盐可甜的爱豆高清锁屏壁纸图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313505.html
    2016年1月日历精选清新护眼壁纸图片5下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313494.html
    奔驰梅赛德斯SLK55汽车壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313489.html
    延禧攻略 清宫浮世绘版海报壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313487.html
    海洋世界里的动物蓝色图片桌面壁纸1下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313485.html
    OL制服美女美腿丝袜性感图片桌面壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313470.html
    飞檐走壁的美女个性壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313466.html
    
    */
    // Default start URL; other sample URLs are kept below for reference.
    // url = "https://www.2717.com/ent/meinvtupian/2019/316305.html";
    // url = 'https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html';
    // url = 'https://lq.2717.com/kbtp/2018/313409.html';
    // url = 'https://lq.2717.com/kbtp/2017/184385.html';
    // url = 'https://www.2717.com/beautiful/zhuomianbeijing/2018/313450.html';
    let url = 'https://www.2717.com/beautiful/qichetuku/2015/17388.html';

    // Command line: node <script> [url] [type]
    // Use slice (not splice) so process.argv is left untouched, and avoid the
    // name `arguments`, which is illegal as a binding in strict mode / ES modules.
    const cliArgs = process.argv.slice(2);
    if (cliArgs.length > 0) {
        url = cliArgs[0];
    }
    let type = '美女图片';
    if (cliArgs.length > 1) {
        type = cliArgs[1];
    }
    main(url, type);
    
    
    
     
    本次本来想通过继承 events 的事件驱动类来写爬虫,但经过测试死活不行,后来只有使用外部 events 实例的 on、emit 方法才通过;但是如下测试代码通过继承 events 又可以:
    
    
    let EventsDemo = require('events');
    
    
    /**
     * Small demo showing that a class extending the events emitter can
     * register listeners on itself and emit to them.
     */
    class MyEvents extends EventsDemo {
        constructor() {
            super();
        }

        /** Logs "call A", then emits "aaa" with two payload values. */
        callA() {
            console.log("call A");
            this.emit("aaa", "a", 123);
        }

        /** Logs "call B", then emits 'bbb' with three payload values. */
        callB() {
            console.log("call B");
            this.emit('bbb', 'b', 123, 456);
        }

        /**
         * Runs the demo: subscribes to "test" and emits it immediately, prints
         * a separator, then subscribes to "aaa"/"bbb" and triggers them through
         * callA() and callB().
         */
        start() {
            const printTest = (first, second, third) => {
                console.log("p1=" + first + ",p2=" + second + ",p3=" + third);
            };
            this.on("test", printTest);
            this.emit("test", 1, "abc", 3.1415926);

            console.log("==================================================");

            const printA = (first, second) => {
                console.log("callA:p1=" + first + ",p2=" + second);
            };
            const printB = (first, second, third) => {
                console.log("callB:p1=" + first + ",p2=" + second + ",p3=" + third);
            };
            this.on("aaa", printA).on('bbb', printB);

            this.callA();
            this.callB();
        }
    }
    
    
    
    /**
     * 主函数
     */
    //main();
    
    // Declare the instance explicitly; assigning to an undeclared name creates
    // an implicit global and throws a ReferenceError in strict mode / ES modules.
    const myevent = new MyEvents();
    myevent.start();
    
    

    这个问题有点诡异,知道的朋友请指教,谢谢。




     
  • 相关阅读:
    linux下的第一个C程序及其编译方法
    使用open_read_write等底层函数来赋值一个文件
    C++中预定义的宏
    altibase MDB的创建sequence的举例
    C中的时间函数的用法
    联系表单 1
    《jQuery基础教程》读书笔记
    《jQuery基础教程》读书笔记
    《jQuery基础教程》读书笔记
    『原创·翻译』如何阅读论文
  • 原文地址:https://www.cnblogs.com/it-tsz/p/12014451.html
Copyright © 2011-2022 走看看