zoukankan      html  css  js  c++  java
  • shell实践--简单抓取网页内容

    #!/bin/bash
    #
    # Crawl pages 1-5 of a TesterHome user's topic list, collect each
    # topic/article link together with its link text, then download every
    # linked page into ./htmldoc/<link text>.html.
    #
    # Files written in the current directory (recreated on every run):
    #   ./suffix_path.txt       raw "<path>@<link text>" lines
    #   ./suffix_path_name.txt  the same lines with all whitespace removed
    #   ./htmldoc/*.html        one downloaded page per collected link
    set -u

    base_path="https://testerhome.com/"
    user_path="ycwdaaaa/topics?page="

    # -f: on a first run there is nothing to delete, which is not an error.
    rm -f ./suffix*
    # The download loop redirects into ./htmldoc, so it has to exist.
    mkdir -p ./htmldoc
    rm -f ./htmldoc/*

    for i in {1..5}; do
      # Quoted: the URL contains "?", which is a glob character when unquoted.
      user_page="${base_path}${user_path}${i}"
      echo "${user_page}"
      # Keep the lines that link to a topic/article, split them on double
      # quotes and glue fields 4 and 5 together, then split on angle brackets
      # to emit "<path>@<link text>".
      # NOTE(review): the field numbers assume the href is the second quoted
      # attribute of the anchor tag - confirm against the live page markup.
      # -sS hides curl's progress meter but still reports real errors.
      curl -sS "${user_page}" \
        | grep -E 'href="(/articles|/topics)/[0-9]{5}' \
        | awk -F '"' '{print $4$5}' \
        | sort \
        | awk -F '>|<' '{print $1 "@" $2}' >> ./suffix_path.txt
    done

    # Strip every whitespace character so the link text is usable as a
    # file name.
    sed 's/[[:space:]]//g' ./suffix_path.txt > ./suffix_path_name.txt

    # Read line by line instead of word-splitting the output of cat, so a
    # line is never glob-expanded or split unexpectedly.
    while IFS= read -r j; do
      echo "文件行内容:${j}"
      # Field 1 is the URL path, field 2 the link text; anything after a
      # further "@" is discarded, matching the former cut -f1 / cut -f2.
      IFS='@' read -r arr_0 arr_1 _ <<< "${j}"
      if [[ -z "${arr_0}" || -z "${arr_1}" ]]; then
        echo "skipping malformed line: ${j}" >&2
        continue
      fi
      echo "=================================="
      echo "arr[0] is: ${arr_0}"
      echo "arr[1] is: ${arr_1}"
      # base_path already ends with "/", so drop the path's leading slash
      # to avoid requesting "https://testerhome.com//topics/...".
      topic_path="${base_path}${arr_0#/}"
      echo "topic_path is: ${topic_path}"
      # A "/" in the link text would be taken as a directory separator and
      # make the redirect fail, so replace it.
      curl -sS "${topic_path}" > "./htmldoc/${arr_1//\//_}.html"
    done < ./suffix_path_name.txt

    ---------------------------------------------------------------------------------------

    #!/bin/bash
    #
    # Variant of the topic crawler that splits each "<path>@<link text>"
    # record into an array using IFS instead of cut.
    #
    # Crawls pages 1-5 of a TesterHome user's topic list, collects each
    # topic/article link together with its link text, then downloads every
    # linked page into ./htmldoc/<link text>.html.
    #
    # Files written in the current directory (recreated on every run):
    #   ./suffix_path.txt       raw "<path>@<link text>" lines
    #   ./suffix_path_name.txt  the same lines with all whitespace removed
    #   ./htmldoc/*.html        one downloaded page per collected link
    set -u

    base_path="https://testerhome.com/"
    user_path="ycwdaaaa/topics?page="

    # -f: on a first run there is nothing to delete, which is not an error.
    rm -f ./suffix*
    # The download loop redirects into ./htmldoc, so it has to exist.
    mkdir -p ./htmldoc
    rm -f ./htmldoc/*

    for i in {1..5}; do
      # Quoted: the URL contains "?", which is a glob character when unquoted.
      user_page="${base_path}${user_path}${i}"
      echo "${user_page}"
      # Keep the lines that link to a topic/article, split them on double
      # quotes and glue fields 4 and 5 together, then split on angle brackets
      # to emit "<path>@<link text>".
      # NOTE(review): the field numbers assume the href is the second quoted
      # attribute of the anchor tag - confirm against the live page markup.
      # -sS hides curl's progress meter but still reports real errors.
      curl -sS "${user_page}" \
        | grep -E 'href="(/articles|/topics)/[0-9]{5}' \
        | awk -F '"' '{print $4$5}' \
        | sort \
        | awk -F '>|<' '{print $1 "@" $2}' >> ./suffix_path.txt
    done

    # Strip every whitespace character so the link text is usable as a
    # file name.
    sed 's/[[:space:]]//g' ./suffix_path.txt > ./suffix_path_name.txt

    # Read line by line instead of word-splitting the output of cat, so a
    # line is never glob-expanded or split unexpectedly.
    while IFS= read -r j; do
      echo "文件行内容:${j}"
      # Prefixing the assignment scopes IFS='@' to this one read, so the
      # shell's IFS never has to be saved and restored. Saving it inside the
      # loop captured "@" from the second iteration on and left IFS broken
      # once the loop finished.
      IFS='@' read -r -a arr <<< "${j}"
      arr_0=${arr[0]:-}
      arr_1=${arr[1]:-}
      if [[ -z "${arr_0}" || -z "${arr_1}" ]]; then
        echo "skipping malformed line: ${j}" >&2
        continue
      fi
      echo "=================================="
      echo "arr[0] is: ${arr_0}"
      echo "arr[1] is: ${arr_1}"
      # base_path already ends with "/", so drop the path's leading slash
      # to avoid requesting "https://testerhome.com//topics/...".
      topic_path="${base_path}${arr_0#/}"
      echo "topic_path is: ${topic_path}"
      # A "/" in the link text would be taken as a directory separator and
      # make the redirect fail, so replace it.
      curl -sS "${topic_path}" > "./htmldoc/${arr_1//\//_}.html"
    done < ./suffix_path_name.txt

  • 相关阅读:
    hdu1686 最大匹配次数 KMP
    洛谷 P5057 [CQOI2006]简单题(树状数组)
    洛谷 P5020 货币系统
    洛谷 P5019 铺设道路(差分)
    洛谷 P1119 灾后重建(Floyd)
    洛谷 P1082 同余方程(同余&&exgcd)
    洛谷 P2384 最短路
    洛谷 P3371 【模板】单源最短路径(弱化版) && dijkstra模板
    洛谷 P1387 最大正方形
    洛谷 P2866 [USACO06NOV]糟糕的一天Bad Hair Day
  • 原文地址:https://www.cnblogs.com/fqfanqi/p/11751582.html
Copyright © 2011-2022 走看看