Linux Shell 网页抓取

 脚本 cmd.sh（用法：cmd.sh args.txt；args.txt 的示例内容见脚本之后）

#! /bin/bash
# Validate the command line: the first argument must be present and must
# name an existing file (it is the file read by the settings loop below).
# Quoting "$1" keeps both tests well-formed when the argument is missing
# or contains whitespace or glob characters.
if [ -z "$1" ] || [ ! -e "$1" ]; then
	echo "Usage: cmd.sh input " >&2
	# A usage error is a failure, so report it with a non-zero status.
	exit 1
fi


# Demonstration of the positional parameters and the common loop forms.

# $0 is the name the script was invoked with.
echo "$0"

# "$@" yields every argument as its own word, so an argument containing
# spaces or glob characters is printed intact on a single line.
for num in "$@"; do
    printf '%s\n' "$num"
done

# Count from -3 up to the number of arguments ($#), inclusive.
for ((i = -3; i <= $#; i++)); do
    echo "$i"
done

# Brace expansion over a literal range.
for i in {0..5}; do
    echo "$i"
done

# All arguments on one line; "$*" joins them with the first character of
# IFS (a space by default) without re-splitting or globbing them.
printf '%s\n' "$*"

# C-style arithmetic loop.
for ((i = 4; i < 7; i++)); do
    echo "$i"
done

# NOTE: $$ expands to the process ID of this shell, not to the argument
# list, despite the "all:" label.
echo "all:$$"
  
# sed program that strips leading and trailing whitespace from a line.
# Two anchored substitutions are used so that a line consisting only of
# blanks becomes empty and tabs are trimmed as well as spaces.
trimReg='s/^[[:space:]]*//;s/[[:space:]]*$//'

# Scratch file used by the de-duplication step later in the script.
# mktemp creates a uniquely named private file (honouring TMPDIR) instead
# of dropping a UUID-named file into the current directory.
tmpfile=$(mktemp) || { echo "cannot create temporary file" >&2; exit 1; }
  
  
# Parse the settings file named by $1: one "key = value" pair per line.
# Recognised keys (url, beginwith, endwith, pagereg, savepath, prefix,
# proxy) are stored in shell variables of the same name; lines without
# an "=" and lines with any other key are ignored.
#
# NOTE: read is deliberately used WITHOUT -r. read therefore processes
# backslash escapes, so a doubled backslash in the file reaches the
# variable as a single one. Adding -r would change how existing settings
# files are interpreted.
#
# The "|| [ -n "$line" ]" clause keeps a final line that has no trailing
# newline, which a plain "while read" would silently drop.
# shellcheck disable=SC2162
while IFS= read line || [ -n "$line" ]; do
    # Without an "=" both expansions below would return the whole line,
    # making the key equal to its own value.
    [[ "$line" == *=* ]] || continue

    key=${line%%=*}     # text before the first "="
    value=${line#*=}    # text after the first "="

    # Trim surrounding whitespace using parameter expansion only. Nothing
    # is expanded unquoted, so glob characters (*, ?, [..]) and runs of
    # inner spaces in a value are preserved exactly as written.
    key=${key#"${key%%[![:space:]]*}"}
    key=${key%"${key##*[![:space:]]}"}
    value=${value#"${value%%[![:space:]]*}"}
    value=${value%"${value##*[![:space:]]}"}

    case "$key" in
        url)       url=$value ;;
        beginwith) beginwith=$value ;;
        endwith)   endwith=$value ;;
        pagereg)   pagereg=$value ;;
        savepath)  savepath=$value ;;
        prefix)    prefix=$value ;;
        proxy)     proxy=$value ;;
    esac
done < "$1"
  
# Report the effective settings, one "name:value" line per variable, in a
# fixed order. ${!field} is indirect expansion: it yields the value of the
# variable whose name is stored in $field (empty when that variable is unset).
for field in url beginwith pagereg endwith prefix proxy savepath tmpfile; do
    printf '%s:%s\n' "$field" "${!field}"
done
# Download the page at $url (through $proxy when one is configured) and
# convert it from GBK to UTF-8.
#
# The curl arguments are collected in an array so the URL and the proxy
# are each passed as exactly one word, even when they contain spaces or
# glob characters such as "?" or "*", and so the pipeline is written once.
curl_args=(-s "$url")
if [ -n "$proxy" ]; then
	curl_args=(-x "$proxy" "${curl_args[@]}")
fi
content=$(curl "${curl_args[@]}" | iconv -f gbk -t utf-8)

# ${#content} is the shell's own string length (counted in characters of
# the current locale); no external expr process is needed.
length=${#content}
echo "download:$length byte(s)"
# Keep only the part of the page after the first occurrence of $beginwith
# and before the first following occurrence of $endwith.
#
# The markers are quoted inside the expansions so they are matched as
# literal text; unquoted, characters such as "[", "*" and "?" in the
# marker would be interpreted as glob pattern syntax.
if [ -n "$beginwith" ]; then
	content=${content#*"${beginwith}"}
fi
# An empty end marker must be skipped: "%%*" on its own would match, and
# therefore delete, the entire string.
if [ -n "$endwith" ]; then
	content=${content%%"${endwith}"*}
fi
length=${#content}
echo "after filer:$length byte(s)"

# Extract every match of the Perl-compatible regex $pagereg from the
# filtered page, drop duplicates, optionally prepend $prefix to each
# match, and write the result to $savepath. $tmpfile is used as scratch
# space and removed at the end.

# The page is matched as a single line with every run of whitespace
# (including newlines) collapsed to one space. Splitting with "read -a"
# instead of an unquoted expansion performs the same word splitting but
# never triggers pathname expansion on "*", "?" or "[..]" in the page.
read -r -d '' -a page_words <<<"$content"

# awk prints a line only the first time it is seen, so duplicates are
# removed while the order of first appearance is kept.
printf '%s\n' "${page_words[*]}" \
	| grep -Po -- "$pagereg" \
	| awk '!seen[$0]++' > "$tmpfile"

if [ -n "$prefix" ]; then
	# The prefix is handed to awk through the environment and concatenated
	# as plain text, so characters such as "/" or "&" (common in URLs) need
	# no escaping, unlike in a sed replacement string.
	LINK_PREFIX=$prefix awk '{ print ENVIRON["LINK_PREFIX"] $0 }' < "$tmpfile" > "$savepath"
else
	cp -- "$tmpfile" "$savepath"
fi

rm -f -- "$tmpfile"

  
# Demonstration of simple string manipulation.
str="0000012345456789000000"
echo "$str"
# Alternatives kept for reference (substring extraction):
#str= expr substr $str 1 2
#str=${str:2:3}
# Remove the shortest prefix that ends in "0" - here just the first character.
str=${str#*0}
echo "$str"
# Trim the string. The expansions are quoted so the leading and trailing
# blanks reach sed untouched; the sed program in $trimReg is then what
# actually removes them (unquoted, word splitting would strip them first).
str="  s =  "
str=$(echo "$str" | sed -e "${trimReg}")
echo "[$str]"
echo "$str" | sed -e "${trimReg}"

url = focus.news.163.com
beginwith = <ul class="focuslist-1" id="focusTab-1">
pagereg = (?<=href=\\")http://focus\\.news\\.163\\.com/[\\d]+.+?(?=\\")
endwith =  <div class="con-4" area clearfix">
savepath = 163.txt

躲猫猫社团团长 http://t.sina.com.cn/coolria

查看全文

相关阅读:
How to: Implement a View Item 如何：实现视图项
 How to: Create a New Object using the Navigation Control 如何：使用导航控件创建新对象
 How to: Access the Transition Manager 如何：访问过渡管理器
 How to: Implement Custom Context Navigation 如何：实现自定义上下文导航
 How to: Access Master Detail View and Nested List View Environment 如何：访问主详细信息视图和嵌套列表视图环境
 How to: Display a Detail View Directly in Edit Mode in ASP.NET Applications 如何：在ASP.NET应用程序中直接在编辑模式下显示详细信息视图
 How to: Detect a Lookup List View in Code 如何：在代码中检测查找列表视图
 How to: Create and Show a Detail View of the Selected Object in a Popup Window 如何：在弹出窗口中创建和显示选定对象的详细信息视图
 How to: Access Objects Selected in the Current View 如何：访问在当前视图中选择的对象
 How to: Display a List View as a Pivot Grid Table and Chart 如何：将列表视图显示为数据透视网格表和图表

原文地址：https://www.cnblogs.com/yangyh/p/shell.html