zoukankan      html  css  js  c++  java
  • nodejs实现抓取图片的爬虫脚本--crawler.js

    仅做了必要的注释,我太懒了。目前只是一个雏形,只实现了基本的需求。有时间会修改得更完善一些。

    /*
     * @Author: jiahaiLiu
     * @Date:   2017-07-17 10:44:03
     * @Last Modified by:   jiahaiLiu
     * @Last Modified time: 2017-07-17 18:53:48
     * @Usage: node crawler [100]
     */
    
    'use strict';
    
    /*
     * Request is designed to be the simplest way possible to make http calls. 
     * It supports HTTPS and follows redirects by default.
     */
    const request = require('request');
    // cheerio: a fast, flexible, and lean implementation of core jQuery designed specifically for the server.
    const cheerio = require('cheerio');
    /*
     * Async is a utility module which provides straight-forward,
     * powerful functions for working with asynchronous JavaScript.
     */
    const async = require('async');
    const path = require('path');
    const fs = require('fs');
    const url = require('url');
    /**
     * Parse the requested number of pictures from a raw command-line value.
     *
     * @param {string|undefined} rawValue - Raw CLI argument (may be absent).
     * @param {number} [fallback=100] - Used when rawValue is missing or is not a positive integer.
     * @returns {number} A positive integer count.
     */
    function parseTargetAmount(rawValue, fallback = 100) {
        const parsed = Number.parseInt(rawValue, 10);
        return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
    }

    // Number of pictures to download: first CLI argument, 100 by default.
    // Read by index so that process.argv is left untouched.
    const targetAmount = parseTargetAmount(process.argv[2]);
    /*let getLink = 'http://image.so.com/j?q=%E7%BE%8E%E5%A5%B3&src=srp&correct=%E7%BE%8E%E5%A5%B3&sn=61&pn=60&sid=7e73fad3c0eb8367ede610dcf2784c0e&ran=0&ras=0';*/
    // Folder in which the downloaded pictures are stored.
    const collect_pic_dir = './collect_pic/';

    // Collected picture links.
    const imgList = [];
    let dest;
    // Running index used to name the downloaded files (0.jpg, 1.jpg, ...).
    let start = 0;
    // Script start time in milliseconds, used to report the total elapsed time.
    const t1 = new Date().getTime();
    // URL parts of the search endpoint, in the shape accepted by url.format().
    // query.sn and query.sid are rewritten between pages.
    const urlObj = {
        protocol: 'http:',
        slashes: true,
        auth: null,
        host: 'image.so.com',
        port: null,
        hostname: 'image.so.com',
        hash: null,
        query: {
            q: '美女',
            src: 'srp',
            correct: '美女',
            sn: '0',
            pn: '60',
            sid: '7e73fad3c0eb8367ede610dcf2784c0e',
            ran: '0',
            ras: '0'
        },
        pathname: '/j',
    };
    // Most recently requested URL.
    let urlLink;
    // Most recently parsed response.
    let resObj;
    
    
    
    // Make sure the output folder is present before any download is attempted.
    const outputDirMissing = !fs.existsSync(collect_pic_dir);
    if (outputDirMissing) {
        fs.mkdirSync(collect_pic_dir);
        console.log(`The ${collect_pic_dir} folder has been created!`);
    }

    // Start collecting links; asyncDownload is handed over as the completion callback.
    loop(asyncDownload);
    
    /**
     * Request search-result pages one after another until enough picture links are collected.
     *
     * The request URL is built from the module-level `urlObj`. Every `img` value found in the
     * response's `list` is appended to `imgList`. Once `imgList` holds at least `targetAmount`
     * links, `cb` is invoked; otherwise `urlObj.query.sn` and `urlObj.query.sid` are advanced
     * from the response and the next page is requested.
     *
     * When no further page can be fetched (request error, non-200 status, invalid JSON, empty
     * page, or `end` set in the response), the reason is logged and `cb` is still invoked as
     * long as at least one link has been collected, so that partial results are not lost.
     *
     * Response fields read by this function (abridged example):
     *   {
     *       end: false,
     *       sid: "6b57a007f19740b44d562f6e0ec6e050",
     *       lastindex: 121,
     *       list: [{
     *           index: 61,
     *           width: "1000",
     *           height: "1504",
     *           imgtype: "JPEG",
     *           img: "http://img165.poco.cn/mypoco/myphoto/20111030/05/54704062201110300502223689419360167_010.jpg",
     *           thumb: "http://p0.so.qhmsg.com/t01da6596eb67097425.jpg"
     *       }]
     *   }
     *
     * @param {Function} cb - Invoked without arguments once link collection is finished.
     */
    function loop(cb) {
        // Hand over whatever has been collected so far instead of silently dropping it.
        function finishWithCollected() {
            if (imgList.length > 0) {
                cb();
            }
        }

        urlLink = url.format(urlObj);
        console.log(urlLink);
        request(urlLink, function(err, res, body) {
            if (err) {
                console.log('request failed:', err);
                finishWithCollected();
                return;
            }
            if (res.statusCode !== 200) {
                console.log('unexpected status code:', res.statusCode);
                finishWithCollected();
                return;
            }

            try {
                resObj = JSON.parse(res.body);
            } catch (parseErr) {
                console.log('response is not valid JSON:', parseErr.message);
                finishWithCollected();
                return;
            }

            const pageItems = resObj && Array.isArray(resObj.list) ? resObj.list : [];
            pageItems.forEach(function(item) {
                if (item && item.img) {
                    imgList.push(item.img);
                }
            });

            if (imgList.length >= targetAmount) {
                cb();
                return;
            }

            // An empty page is treated like the end of the data; otherwise the same
            // request could be repeated forever without ever reaching the target.
            if (!resObj || resObj.end || pageItems.length === 0) {
                console.log('no more datas from source url');
                finishWithCollected();
                return;
            }

            urlObj.query.sn = resObj.lastindex + 1;
            urlObj.query.sid = resObj.sid;
            loop(cb);
        });
    }
    
    // download picture
    /**
     * Start the download of every link in `imgList`, one at a time, 400 ms apart.
     *
     * Files are written to `collect_pic_dir` and named after the running `start` counter
     * (`0.jpg`, `1.jpg`, ...). The series callback is invoked right after `downloadPic` is
     * called for an item, so the final message reports the time until all downloads were
     * started; it does not wait for the transfers themselves.
     */
    function asyncDownload() {
        console.log('图片总数:', imgList.length);
        async.mapSeries(imgList, function(item, callback) {
            setTimeout(function() {
                const target = collect_pic_dir + start + '.jpg';
                // Advance the counter before handing control back to the series.
                start++;
                downloadPic(item, target);
                callback(null, item);
            }, 400);
        }, function(err) {
            if (err) {
                console.log(err);
            }
            let t2 = new Date().getTime();
            console.log('全部完成,总耗时:', (t2 - t1) + 'ms');
        });
    }
    
        /**
         * Download one picture and stream it into a local file.
         *
         * Errors from the HTTP request and from the file stream are logged instead of being
         * left unhandled. When the request fails, the file stream is ended so that it does
         * not stay open.
         *
         * @param {string} src - URL of the picture.
         * @param {string} dest - Path of the file to write.
         * @returns {fs.WriteStream} The stream writing to `dest`.
         */
        function downloadPic(src, dest) {
            // Create the request first: if it throws for an invalid URL, no file is opened.
            const download = request.get(src);
            const fileStream = fs.createWriteStream(dest);

            // Without a listener, an 'error' event on the file stream would be thrown.
            fileStream.on('error', function(err) {
                console.log(err);
            });

            download.on('error', function(err) {
                console.log(err);
                // pipe() does not end the destination when the source fails.
                fileStream.end();
            });

            download.pipe(fileStream);
            return fileStream;
        }
  • 相关阅读:
    jbpm4.4使用的hibernate3如何兼容spring5.x及异常Caused by: java.lang.ClassNotFoundException: org.hibernate.impl.SessionImpl
    Caused by: java.lang.ClassNotFoundException: io.netty.resolver.AddressResolverGroup
    Caused by: java.lang.ClassNotFoundException: org.jboss.marshalling.ClassResolver
    Caused by: java.lang.ClassNotFoundException: com.fasterxml.jackson.dataformat.yaml.YAMLFactory
    Redisson报错Caused by: java.lang.IllegalArgumentException: RIVER
    redis中StringRedisTemplate的setIfAbsent方法设置过期时间
    xshell下载
    mysql下载地址
    最小化可行产品MVP
    电梯演讲
  • 原文地址:https://www.cnblogs.com/xiaohaifengke/p/7698913.html
Copyright © 2011-2022 走看看