zoukankan      html  css  js  c++  java
  • Linux Shell 网页抓取

     args.txt
    #! /bin/bash
    # Extract links from a web page according to a "key = value" settings file.
    # Usage: cmd.sh input
    #   input - path of an existing settings file; the keys read further down
    #           are url, beginwith, endwith, pagereg, savepath, prefix, proxy.
    #
    # Stop early when the settings file is missing. "$1" is quoted so that a
    # path containing spaces or glob characters is tested as one word.
    if [ -z "$1" ] || [ ! -e "$1" ]; then
        # A usage error is a failure: report it on stderr and exit non-zero
        # so that callers can detect it.
        echo "Usage: cmd.sh input " >&2
        exit 1
    fi
    
    
    # Demonstrations of positional parameters and the common loop forms.

    # Name this script was invoked as.
    echo "$0"

    # One argument per line. "$@" keeps each argument as a single word even
    # when it contains whitespace or glob characters.
    for num in "$@"; do
        echo "$num"
    done

    # Count from -3 up to the number of arguments (inclusive) without
    # spawning an external seq process.
    for ((i = -3; i <= $#; i++)); do
        echo "$i"
    done

    # Brace expansion: 0 through 5.
    for i in {0..5}; do
        echo "$i"
    done

    # All arguments on one line.
    echo "$@"

    # C-style loop: 4, 5, 6.
    for ((i = 4; i < 7; i++)); do
        echo "$i"
    done

    # $$ expands to the process ID of the running shell.
    echo "all:$$"
      
    # sed program that removes leading and trailing spaces from a line. A line
    # without any non-space character does not match and is left unchanged.
    trimReg='s/\(^ *\)\(.*[^ ]\)\( *$\)/\2/'
    # Scratch file used later in the script. mktemp creates a uniquely named
    # file for us instead of reusing a bare UUID as a file name in the current
    # directory, and does not depend on the Linux-only /proc tree.
    tmpfile=$(mktemp) || exit 1
    # Remove the scratch file on every exit path, including early failures.
    trap 'rm -f -- "$tmpfile"' EXIT
      
      
    # Load "key = value" settings from a file into shell variables.
    # Globals written: url, beginwith, endwith, pagereg, savepath, prefix, proxy
    # Arguments: $1 - path of the settings file
    # Returns: non-zero when the file cannot be opened.
    # Lines whose key is not one of the names above are ignored.
    load_config() {
        local line key value
        local -a words
        local IFS=$' \t\n'
        # read is used WITHOUT -r on purpose: it strips one level of
        # backslashes, and the sample settings shown with this script double
        # their backslashes to compensate. Switching to -r would change how
        # existing files are parsed.
        # The "|| [ -n ... ]" part keeps a final line that lacks a newline.
        # shellcheck disable=SC2162
        while read line || [ -n "$line" ]; do
            # Key: text before the first "=". Value: text after the first "=",
            # so values may themselves contain "=".
            key=${line%%=*}
            value=${line#*=}
            # Split on whitespace and re-join with single spaces. This trims
            # both ends exactly like the former unquoted echo did, but never
            # glob-expands the text (regex values contain [, ? and *).
            read -r -a words <<< "$key"
            key="${words[*]}"
            read -r -a words <<< "$value"
            value="${words[*]}"
            case "$key" in
                url) url=$value ;;
                beginwith) beginwith=$value ;;
                endwith) endwith=$value ;;
                pagereg) pagereg=$value ;;
                savepath) savepath=$value ;;
                prefix) prefix=$value ;;
                proxy) proxy=$value ;;
            esac
        done < "$1"
    }

    load_config "$1"
      
    # Report the effective settings, one "name:value" line per setting.
    printf '%s\n' \
        "url:$url" \
        "beginwith:$beginwith" \
        "pagereg:$pagereg" \
        "endwith:$endwith" \
        "prefix:$prefix" \
        "proxy:$proxy" \
        "savepath:$savepath" \
        "tmpfile:$tmpfile"
    # Narrow $content to the text between the configured markers.
    # Globals: content (read and written), beginwith, endwith (read)
    # The markers are used as shell patterns: everything up to and including
    # the first beginwith match is dropped, then everything from the first
    # endwith match onwards is dropped.
    # An empty marker is skipped. Without that guard an empty endwith would
    # turn the pattern into a bare "*" and erase the whole page.
    clip_content() {
        if [ -n "$beginwith" ]; then
            content=${content#*${beginwith}}
        fi
        if [ -n "$endwith" ]; then
            content=${content%%${endwith}*}
        fi
    }

    # Download the page (through the proxy when one is configured) and
    # convert it from GBK to UTF-8. The options live in an array so that the
    # proxy and URL are passed to curl as single, unmodified words.
    curl_opts=(-s)
    if [ -n "$proxy" ]; then
        curl_opts=(-x "$proxy" -s)
    fi
    content=$(curl "${curl_opts[@]}" "$url" | iconv -f gbk -t utf-8)
    # ${#content} is computed by the shell itself; handing the whole page to
    # an external expr as a command-line argument can fail for large pages.
    length=${#content}
    echo "download:$length byte(s)"
    clip_content
    length=${#content}
    echo "after filer:$length byte(s)"
    
    # Print $1 as one line with every run of whitespace collapsed to a single
    # space. The body runs in a subshell so that disabling globbing and
    # resetting the positional parameters cannot leak into the caller.
    squash_whitespace() (
        IFS=$' \t\n'
        # Word-split only: page text such as "*" or "?" must never be
        # expanded into file names.
        set -f
        # shellcheck disable=SC2086
        set -- $1
        printf '%s\n' "$*"
    )

    # Copy stdin to stdout, keeping only the first occurrence of each line
    # (order of first appearance is preserved).
    dedupe_lines() {
        awk '!seen[$0]++'
    }

    # Copy stdin to stdout with $1 prepended verbatim to every line. The
    # prefix travels through the environment instead of being spliced into a
    # sed program, so "/", "&" and backslashes in it are taken literally.
    prefix_lines() {
        LINE_PREFIX=$1 awk '{ print ENVIRON["LINE_PREFIX"] $0 }'
    }

    # Pull every match of the page regex out of the flattened page, drop
    # duplicates, add the optional prefix and store the result.
    squash_whitespace "$content" \
        | grep -Po -- "$pagereg" \
        | dedupe_lines \
        | prefix_lines "$prefix" > "$savepath"

    # The scratch file is no longer needed by this step; remove it if set.
    if [ -n "$tmpfile" ]; then
        rm -f -- "$tmpfile"
    fi
    
      
    # Parameter-expansion demo: remove the shortest leading part that ends
    # in "0", which here drops exactly one leading zero.
    str="0000012345456789000000"
    printf '%s\n' "$str"
    # Alternatives that were left disabled:
    #str= expr substr $str 1 2
    #str=${str:2:3}
    str="${str#*0}"
    printf '%s\n' "$str"
    # Trimming demo.
    str="  s =  "
    # NOTE: the expansion below is unquoted, so the shell's word splitting
    # already removes the outer spaces before sed receives the text.
    # shellcheck disable=SC2086
    str=$(echo $str | sed -e "$trimReg")
    printf '%s\n' "[$str]"
    # shellcheck disable=SC2086
    echo $str | sed -e "$trimReg"
    
    url = focus.news.163.com
    beginwith = <ul class="focuslist-1" id="focusTab-1">
    pagereg = (?<=href=\\")http://focus\\.news\\.163\\.com/[\\d]+.+?(?=\\")
    endwith =  <div class="con-4" area clearfix">
    savepath = 163.txt
    

    躲猫猫社团团长 http://t.sina.com.cn/coolria

  • 相关阅读:
    NSDateFormatter格式详细列表一览
    Core Data could not fulfill a fault
    使用Devstack部署neutron网络节点
    配置基于Devstack的嵌套KVM虚拟化
    配置基于Devstack的嵌套KVM虚拟化
    Devstack单节点环境实战配置
    Devstack单节点环境实战配置
    Openstack贡献者须知 2 — 社区工作运作 & 代码贡献流程
    Openstack贡献者须知 2 — 社区工作运作 & 代码贡献流程
    Openstack 中的消息总线 & AMQP
  • 原文地址:https://www.cnblogs.com/yangyh/p/shell.html
Copyright © 2011-2022 走看看