把以前博客的东西也迁移过来吧,这个是以前在公司做的,原来放在 CSDN 上,先迁过来。
基于 wget 的多进程抓取脚本实现,有问题请联系我(微博:偶是周亮)
#!/bin/bash
#
# spider.sh - download a list of URLs with several parallel wget workers.
#
# The URL list (one URL per line) is dealt round-robin into SPIDER_PID_NUM
# shard files named <page_store_dir>/url_<n>. One background worker per shard
# then fetches its URLs sequentially. Each page is stored as
# <page_store_dir>/<md5>, where <md5> is the MD5 of the URL followed by a
# newline, and wget's log for that URL is written to <log_path>_<md5>.
#
# Usage: spider.sh -u url_path -d page_store_dir [options]
#   -u FILE  file holding the URLs to fetch
#   -d DIR   existing directory for the shard files and downloaded pages
#   -l PATH  prefix for the per-URL wget log files (default: empty, so the
#            logs are named _<md5> in the current directory)
#   -t N     wget retry count (wget -t), default 2
#   -T SEC   wget timeout in seconds (wget -T), default 10
#   -s SEC   pause before starting each worker, default 1
#   -S SEC   pause before each URL inside a worker, default 1
#   -p N     number of parallel workers, default 6
#   -h       print usage and exit
#   -v       print version and exit
#
# Exit status: 1 URL file missing, 2 store directory missing,
#              3 usage error or -h, 4 after -v, otherwise 0.

url_path="-"
spider_dir=""
LOG_PATH=""
WGET_TIMECOUNT=2
WGET_TIME=10
FORK_SLEEP_TIME=1
ONEURL_SLEEP_TIME=1
SPIDER_PID_NUM=6

# Print the usage line and terminate with status 3.
usage() {
  echo "usage:spider.sh -u url_path -d page_store_dir"
  exit 3
}

# Print the version string and terminate with status 4.
version() {
  echo "same-source-tools-spider-1.0.0"
  exit 4
}

#######################################
# Fetch every URL listed in one shard file, one after another.
# Globals:   ONEURL_SLEEP_TIME, LOG_PATH, spider_dir, WGET_TIMECOUNT,
#            WGET_TIME (all read)
# Arguments: $1 - path of the shard file (one URL per line)
# Outputs:   one NOTICE or ERROR line per URL on stdout
#######################################
crawl_shard() {
  local shard=$1
  local url url_md5 rc stamp

  [[ -e "${shard}" ]] || return 0

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while read -r url || [[ -n "${url}" ]]; do
    sleep "${ONEURL_SLEEP_TIME}"

    # Hash "URL\n" (what `echo URL | md5sum` hashes) so page file names stay
    # identical to those produced by earlier runs of this script.
    url_md5=$(printf '%s\n' "${url}" | md5sum)
    url_md5=${url_md5%% *}

    # Capture wget's status immediately: running `date` first would overwrite
    # $? and make every download look successful.
    rc=0
    wget -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" \
      -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}" -- "${url}" || rc=$?

    stamp=$(date +"%Y%m%d-%H:%M:%S")
    if (( rc == 0 )); then
      echo "${stamp} NOTICE:spiderwgetsuccess ${url}"
    else
      echo "${stamp} ERROR:spiderwgeterror ${url}"
      # Drop the empty or partial page left behind by the failed download.
      rm -f -- "${spider_dir}/${url_md5}"
    fi
  done < "${shard}"
}

#######################################
# Parse options, shard the URL list and run the workers.
# Arguments: the script's command-line arguments
#######################################
main() {
  local OPTION line i
  local no=0

  # h and v must be listed here, otherwise their case arms can never match.
  while getopts l:u:d:t:T:s:S:p:hv OPTION; do
    case "${OPTION}" in
      u) url_path=${OPTARG} ;;
      d) spider_dir=${OPTARG} ;;
      t) WGET_TIMECOUNT=${OPTARG} ;;
      T) WGET_TIME=${OPTARG} ;;
      s) FORK_SLEEP_TIME=${OPTARG} ;;
      S) ONEURL_SLEEP_TIME=${OPTARG} ;;
      p) SPIDER_PID_NUM=${OPTARG} ;;
      l) LOG_PATH=${OPTARG} ;;
      h) usage ;;
      v) version ;;
      \?) usage ;;
    esac
  done

  # The worker count drives arithmetic loops and the round-robin modulo, so
  # reject anything that is not a positive decimal integer.
  if ! [[ "${SPIDER_PID_NUM}" =~ ^[1-9][0-9]*$ ]]; then
    echo "spider test: -p must be a positive integer, got '${SPIDER_PID_NUM}'" >&2
    usage
  fi

  # Only touch the log file when -l was given; a bare `touch` is an error.
  if [[ -n "${LOG_PATH}" ]]; then
    touch -- "${LOG_PATH}"
  fi

  # Check that the URL list exists.
  if [[ -e "${url_path}" ]]; then
    echo "spider test: ${url_path} is exist"
  else
    echo "url_path spider test: ${url_path} is not exist"
    exit 1
  fi

  # Check that the page store directory exists. An empty value is rejected
  # explicitly: otherwise the shard paths below would resolve to /url_<n>.
  if [[ -n "${spider_dir}" && -e "${spider_dir}" ]]; then
    echo "spider test: ${spider_dir} is exist"
  else
    echo "spider_dir spider test: ${spider_dir} is not exist"
    exit 2
  fi

  # Remove shard files left by a previous run, whatever their count was, so
  # that no stale URLs are fetched again.
  rm -f -- "${spider_dir}"/url_*

  # Create one empty shard file per worker.
  for ((i = 0; i < SPIDER_PID_NUM; i++)); do
    : > "${spider_dir}/url_${i}"
  done

  # Deal the URLs round-robin into the shard files, skipping blank lines.
  while read -r line || [[ -n "${line}" ]]; do
    [[ -n "${line}" ]] || continue
    printf '%s\n' "${line}" >> "${spider_dir}/url_${no}"
    no=$(( (no + 1) % SPIDER_PID_NUM ))
  done < "${url_path}"

  # Start the workers, staggered by FORK_SLEEP_TIME, then wait for all of them.
  for ((i = 0; i < SPIDER_PID_NUM; i++)); do
    sleep "${FORK_SLEEP_TIME}"
    crawl_shard "${spider_dir}/url_${i}" &
  done
  wait
}

main "$@"