zoukankan      html  css  js  c++  java
  • Linux Shell 网页抓取

     cmd.sh(脚本;其输入文件 args.txt 的示例内容列在脚本之后)
    #! /bin/bash
    # Argument guard: the first argument must name an existing input file.
    # "$1" is quoted so that paths containing spaces or glob characters are
    # tested as a single word. On misuse the usage line goes to stderr and the
    # script exits with a non-zero status so callers can detect the failure.
    if [ -z "$1" ] || [ ! -e "$1" ]; then
        echo "Usage: cmd.sh input " >&2
        exit 1
    fi
    
    
    # Demonstration of positional parameters and the common loop forms.
    # Every expansion is quoted so arguments containing whitespace or glob
    # characters are printed exactly as they were passed; printf is used
    # instead of echo so arguments such as "-n" are not taken as options.

    # Name the script was invoked as.
    printf '%s\n' "$0"

    # One line per positional argument ("$@" keeps each argument intact).
    for num in "$@"; do
        printf '%s\n' "$num"
    done

    # Count from -3 up to the number of arguments (what `seq -3 $#` printed),
    # without forking an external command.
    for ((i = -3; i <= $#; i++)); do
        echo "$i"
    done

    # Brace expansion only works with literal bounds.
    for i in {0..5}; do
        echo "$i"
    done

    # All arguments on one line, separated by single spaces.
    printf '%s\n' "$*"

    # C-style arithmetic loop.
    for ((i = 4; i < 7; i++)); do
        echo "$i"
    done

    # NOTE: $$ expands to the PID of the running shell, not to the argument
    # list; the "all:" label is kept exactly as in the original output.
    echo "all:$$"
      
    # sed expression that strips leading and trailing spaces from a line.
    # It is no longer needed by the parser below but stays defined because the
    # string demo at the end of the script uses it.
    trimReg="s/\(^ *\)\(.*[^ ]\)\( *$\)/\2/"

    # Scratch file for the later processing steps. mktemp creates it safely
    # and works on systems without /proc; the EXIT trap removes it on every
    # exit path.
    tmpfile=$(mktemp) || { echo "error: cannot create temporary file" >&2; exit 1; }
    trap 'rm -f -- "$tmpfile"' EXIT

    #######################################
    # Parse a "key = value" config file into global variables.
    # Recognised keys: url, beginwith, endwith, pagereg, savepath, prefix,
    # proxy. Unknown keys and lines without "=" are ignored.
    # Globals:   url, beginwith, endwith, pagereg, savepath, prefix, proxy
    #            (written)
    # Arguments: $1 - path of the config file
    # Returns:   0 on success, 1 if the file cannot be read
    #######################################
    load_config() {
        local line key value

        if [ ! -r "$1" ]; then
            echo "error: cannot read config file: $1" >&2
            return 1
        fi

        # NOTE: 'read' is deliberately used WITHOUT -r. The sample config
        # writes regex backslashes doubled and relies on read collapsing each
        # pair to a single backslash. Adding -r would silently change every
        # existing pagereg value.
        # The '|| [ -n "$line" ]' part keeps a final line that has no
        # trailing newline.
        while read line || [ -n "$line" ]; do
            # Only "key=value" lines carry a setting.
            case $line in
                *=*) ;;
                *) continue ;;
            esac

            key=${line%%=*}     # text before the first '='
            value=${line#*=}    # text after the first '='

            # Trim surrounding whitespace with parameter expansion. Unlike
            # the former unquoted `echo | sed`, this keeps inner whitespace
            # intact, never glob-expands the value, and forks no processes.
            key=${key#"${key%%[![:space:]]*}"}
            key=${key%"${key##*[![:space:]]}"}
            value=${value#"${value%%[![:space:]]*}"}
            value=${value%"${value##*[![:space:]]}"}

            case $key in
                url) url=$value ;;
                beginwith) beginwith=$value ;;
                endwith) endwith=$value ;;
                pagereg) pagereg=$value ;;
                savepath) savepath=$value ;;
                prefix) prefix=$value ;;
                proxy) proxy=$value ;;
            esac
        done < "$1"
    }

    load_config "$1"
      
    # Show the effective configuration before doing any work.
    echo "url:$url"
    echo "beginwith:$beginwith"
    echo "pagereg:$pagereg"
    echo "endwith:$endwith"
    echo "prefix:$prefix"
    echo "proxy:$proxy"
    echo "savepath:$savepath"
    echo "tmpfile:$tmpfile"

    #######################################
    # Narrow a page down to the region of interest.
    # Drops everything up to and including the first occurrence of BEGIN,
    # then everything from the first occurrence of END onwards. Both markers
    # are matched literally (quoted inside the expansion), so characters such
    # as [ ] * ? in them are not treated as glob patterns. An empty marker
    # leaves that side untouched; previously an empty END erased the text.
    # Arguments: $1 - page text, $2 - begin marker, $3 - end marker
    # Outputs:   the narrowed text on stdout
    #######################################
    clip_region() {
        local text=$1
        if [ -n "$2" ]; then
            text=${text#*"$2"}
        fi
        if [ -n "$3" ]; then
            text=${text%%"$3"*}
        fi
        printf '%s' "$text"
    }

    #######################################
    # Print every distinct match of a PCRE in a text, one per line, in
    # first-seen order, each with an optional prefix prepended.
    # Whitespace runs (spaces, tabs, newlines) are squeezed to single spaces
    # first, which is what the former unquoted `echo $content` did, but
    # without exposing the page text to glob expansion. The prefix reaches
    # awk through the environment, so "/", "&" or backslashes in it are
    # harmless (they broke the former sed "s/^/$prefix/").
    # Arguments: $1 - text, $2 - PCRE for grep -P, $3 - prefix (may be empty)
    # Outputs:   the prefixed, de-duplicated matches on stdout
    #######################################
    extract_links() {
        printf '%s' "$1" \
            | tr -s ' \t\n' ' ' \
            | grep -Po -- "$2" \
            | LINK_PREFIX=$3 awk '!seen[$0]++ { print ENVIRON["LINK_PREFIX"] $0 }'
    }

    if [ -z "$url" ] || [ -z "$savepath" ]; then
        # Without these two settings there is nothing to fetch or nowhere to
        # write; do not call curl or redirect to an empty path.
        echo "error: both url and savepath must be set in the config file" >&2
    else
        # Build the curl command line as an array so that the optional proxy
        # and the URL are always passed as single, intact words.
        curl_args=(-s)
        if [ -n "$proxy" ]; then
            curl_args+=(-x "$proxy")
        fi

        # The page is fetched as GBK and converted to UTF-8.
        content=$(curl "${curl_args[@]}" "$url" | iconv -f gbk -t utf-8)
        echo "download:${#content} byte(s)"

        content=$(clip_region "$content" "$beginwith" "$endwith")
        echo "after filer:${#content} byte(s)"

        extract_links "$content" "$pagereg" "$prefix" > "$savepath"
    fi

    # The scratch file is not needed by the pipeline above any more, but it
    # may have been created earlier in the script, so clean it up.
    if [ -n "$tmpfile" ]; then
        rm -f -- "$tmpfile"
    fi
    
      
    # String manipulation demo: prefix stripping and whitespace trimming.
    # All expansions are quoted; in particular the former `echo [$str]` was an
    # unquoted glob pattern that could expand to a matching file name in the
    # current directory.

    # Trim expression: reuse the script-wide one when it is defined, otherwise
    # fall back to an equivalent so this section also works on its own.
    trim_expr=${trimReg:-'s/^ *//;s/ *$//'}

    str="0000012345456789000000"
    printf '%s\n' "$str"
    #str= expr substr $str 1 2 
    #str=${str:2:3} 
    # "#*0" removes the shortest prefix ending in "0", i.e. just the first 0.
    str=${str#*0}
    printf '%s\n' "$str"

    #trim the string 
    str="  s =  "
    str=$(printf '%s\n' "$str" | sed -e "$trim_expr")
    printf '[%s]\n' "$str"
    printf '%s\n' "$str" | sed -e "$trim_expr"
    
    url = focus.news.163.com
    beginwith = <ul class="focuslist-1" id="focusTab-1">
    pagereg = (?<=href=\\")http://focus\\.news\\.163\\.com/[\\d]+.+?(?=\\")
    endwith =  <div class="con-4" area clearfix">
    savepath = 163.txt
    

    躲猫猫社团团长 http://t.sina.com.cn/coolria

  • 相关阅读:
    Windows环境安装tesseract-ocr 4.00并配置环境变量
    python问题集
    使用CefSharp在.Net程序中嵌入Chrome浏览器(八)——Cookie
    python虛擬環境和工具
    pycharm使用(持续更新)
    醒过来的都市
    下一个十年计划6-作品
    下一个十年计划5-方向选择
    下一个十年计划4-反向选择
    负逻辑的实际应用
  • 原文地址:https://www.cnblogs.com/yangyh/p/shell.html
Copyright © 2011-2022 走看看