zoukankan      html  css  js  c++  java
  • wget多进程抓取的实现

    把以前博客的东西也迁移过来吧。这个脚本是以前在公司做的,原来放在 CSDN 上,先迁过来。

    这是一个用多进程实现的 wget 并发抓取脚本,有问题联系我 (微博:偶是周亮)

    #!/bin/bash
    # Default settings; each one can be overridden by a command-line option.

    # File listing the URLs to fetch (-u). "-" is only a placeholder value.
    url_path='-'

    # Value handed to `wget -t` (-t option of this script).
    WGET_TIMECOUNT=2
    # Value handed to `wget -T` (-T option of this script).
    WGET_TIME=10

    # Argument of the `sleep` issued before each worker is started (-s).
    FORK_SLEEP_TIME=1
    # Argument of the `sleep` issued by a worker before each URL (-S).
    ONEURL_SLEEP_TIME=1

    # Number of per-worker URL lists / background workers (-p).
    SPIDER_PID_NUM=6
    # Print the one-line usage summary on stdout, then terminate the
    # script with exit status 3.
    usage() {
            printf '%s\n' "usage:spider.sh -u url_path -d page_store_dir"
            exit 3
    }
    # Print the tool's version string on stdout, then terminate the
    # script with exit status 4.
    version() {
            printf '%s\n' "same-source-tools-spider-1.0.0"
            exit 4
    }
    # Parse the command-line options into the script's global settings.
    #
    # Options:
    #   -u FILE  file listing the URLs to fetch          -> url_path
    #   -d DIR   directory where pages are stored        -> spider_dir
    #   -t N     value passed to `wget -t`               -> WGET_TIMECOUNT
    #   -T N     value passed to `wget -T`               -> WGET_TIME
    #   -s N     sleep before starting each worker       -> FORK_SLEEP_TIME
    #   -S N     sleep before fetching each URL          -> ONEURL_SLEEP_TIME
    #   -p N     number of parallel workers              -> SPIDER_PID_NUM
    #   -l FILE  log file path / per-URL log prefix      -> LOG_PATH
    #   -h       print usage and exit
    #   -v       print version and exit
    #
    # Arguments: the script's positional parameters ("$@").
    # Globals written: the variables listed above, plus OPTION/OPTIND/OPTARG
    # (left global on purpose, exactly as a top-level getopts loop would).
    spider_parse_args() {
            # "h" and "v" must be listed in the optstring, otherwise getopts
            # reports them as invalid and the h)/v) arms can never run.
            while getopts hvl:u:d:t:T:s:S:p: OPTION; do
                    case "${OPTION}" in
                            u) url_path=${OPTARG} ;;
                            d) spider_dir=${OPTARG} ;;
                            t) WGET_TIMECOUNT=${OPTARG} ;;
                            T) WGET_TIME=${OPTARG} ;;
                            s) FORK_SLEEP_TIME=${OPTARG} ;;
                            S) ONEURL_SLEEP_TIME=${OPTARG} ;;
                            p) SPIDER_PID_NUM=${OPTARG} ;;
                            l) LOG_PATH=${OPTARG} ;;
                            h) usage ;;
                            v) version ;;
                            # getopts sets OPTION to a literal "?" for an unknown
                            # option or a missing argument; the "?" must be escaped
                            # so it is not treated as a glob wildcard.
                            \?) usage ;;
                    esac
            done
    }
    spider_parse_args "$@"
    # Create the main log file up front, but only when -l supplied a path:
    # calling touch with an empty expansion would just print a
    # "missing file operand" error.
    if [ -n "${LOG_PATH}" ]; then
            touch -- "${LOG_PATH}"
    fi

    # Check that the URL list file exists (exit status 1 otherwise).
    # The expansion is quoted so that an empty value or a path containing
    # spaces is tested as a single operand.
    if [ -e "${url_path}" ]; then
            echo "spider test: ${url_path} is exist"
    else
            echo "url_path spider test: ${url_path} is not exist"
            exit 1
    fi

    # Check that the page storage directory exists (exit status 2 otherwise).
    # Quoting matters here: with spider_dir unset, the unquoted form became
    # `[ -e ]`, which is always true, so a missing -d option went unnoticed.
    if [ -e "${spider_dir}" ]; then
            echo "spider test: ${spider_dir} is exist"
    else
            echo "spider_dir spider test: ${spider_dir} is not  exist"
            exit 2
    fi
    # Split a URL list round-robin into one list file per worker.
    #
    # Arguments:
    #   $1 - file containing one URL per line
    #   $2 - directory that receives the per-worker files url_0 .. url_<N-1>
    #   $3 - number of workers N (positive integer)
    # Effects: removes every existing "$2"/url_* file, then creates url_0 ..
    #          url_<N-1> and appends line k of $1 to url_<k mod N>.
    # Returns: 0 on success, 1 (with a message on stderr) on invalid arguments.
    spider_split_urls() {
            local src=$1 dir=$2 workers=$3
            local i line no=0

            if [ -z "${dir}" ] || [ ! -e "${src}" ]; then
                    echo "spider_split_urls: url file or target directory missing" >&2
                    return 1
            fi
            case "${workers}" in
                    '' | *[!0-9]*)
                            echo "spider_split_urls: invalid worker count '${workers}'" >&2
                            return 1
                            ;;
            esac
            # Force base 10 so that a value such as "08" is not read as octal.
            workers=$((10#${workers}))
            if [ "${workers}" -lt 1 ]; then
                    echo "spider_split_urls: worker count must be at least 1" >&2
                    return 1
            fi

            # Remove the lists of any previous run unconditionally, so stale
            # entries cannot survive when url_0 happens to be absent.
            rm -f -- "${dir}"/url_*

            # Create one (empty) list per worker, so every list exists even
            # when there are fewer URLs than workers.
            for ((i = 0; i < workers; i++)); do
                    : > "${dir}/url_${i}"
            done

            # Deal the URLs out round-robin. `read -r` keeps backslashes, the
            # `|| [ -n ... ]` clause keeps a last line without a trailing
            # newline, and the quoted printf prevents glob expansion of URLs
            # containing `*`, `?` or `[`.
            while read -r line || [ -n "${line}" ]; do
                    printf '%s\n' "${line}" >> "${dir}/url_${no}"
                    no=$(((no + 1) % workers))
            done < "${src}"
    }
    spider_split_urls "${url_path}" "${spider_dir}" "${SPIDER_PID_NUM}"
    # Fetch every URL listed in one per-worker list file with wget.
    #
    # Arguments:
    #   $1 - list file with one URL per line; a missing file is skipped
    # Globals read: ONEURL_SLEEP_TIME, LOG_PATH, spider_dir,
    #               WGET_TIMECOUNT, WGET_TIME
    # Effects: each page is saved as "${spider_dir}/<md5 of the URL line>",
    #          wget's log goes to "${LOG_PATH}_<md5>", and one NOTICE/ERROR
    #          line per URL is printed on stdout. The output file of a failed
    #          download is removed.
    spider_fetch_list() {
            local list_file=$1
            local url url_md5 wget_status dateFlag

            [ -e "${list_file}" ] || return 0

            while read -r url || [ -n "${url}" ]; do
                    sleep "${ONEURL_SLEEP_TIME}"
                    # Hash the URL plus a trailing newline, which is what
                    # `echo url | md5sum` produces, so file names stay stable.
                    url_md5=$(printf '%s\n' "${url}" | md5sum | awk '{print $1}')
                    wget "${url}" -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}"
                    # Save wget's status immediately: running `date` first
                    # would overwrite $? and every fetch would look successful.
                    wget_status=$?
                    dateFlag=$(date +"%Y%m%d-%H:%M:%S")
                    if [ "${wget_status}" -eq 0 ]; then
                            echo "${dateFlag} NOTICE:spiderwgetsuccess ${url}"
                    else
                            echo "${dateFlag} ERROR:spiderwgeterror ${url}"
                            rm -f -- "${spider_dir}/${url_md5}"
                    fi
            done < "${list_file}"
    }

    # Start one background worker per URL list, pausing FORK_SLEEP_TIME
    # before each start, then block until all workers have finished.
    for ((i = 0; i < SPIDER_PID_NUM; i++)); do
            sleep "${FORK_SLEEP_TIME}"
            spider_fetch_list "${spider_dir}/url_${i}" &
    done
    wait
  • 相关阅读:
    php设计模式 -- 数据映射模式
    php 守护进程 (简单)
    php 守护进程
    php rabbitmq demo
    linux 全局安装composer
    linux 安装rabbitmq
    linux php安装RabbitMq扩展
    http和tcp详解
    lnmp环境脚本自动配置
    30.输入年月日,判断它是该年的第多少天
  • 原文地址:https://www.cnblogs.com/wully/p/3341302.html
Copyright © 2011-2022 走看看