zoukankan      html  css  js  c++  java
  • shell实践--简单抓取网页内容

    #!/bin/bash
    #
    # Download the topics posted by one TesterHome user.
    #
    #   1. Walk pages 1-5 of the user's topic list and append every topic link
    #      to ./suffix_path.txt as "<path>@<title>".
    #   2. Strip all whitespace from those records into ./suffix_path_name.txt.
    #   3. Fetch each topic and save it as ./htmldoc/<title>.html.
    #
    # This variant splits each record into its two fields with cut(1).

    base_path="https://testerhome.com/"
    user_path="ycwdaaaa/topics?page="

    # Start from a clean slate. -f keeps a first run (nothing to delete) quiet,
    # and the output directory is created up front so the redirects below
    # cannot fail on a missing directory.
    rm -f -- ./suffix_path.txt ./suffix_path_name.txt
    mkdir -p ./htmldoc
    rm -f -- ./htmldoc/*

    for ((i = 1; i <= 5; i++)); do
      user_page="${base_path}${user_path}${i}"
      printf '%s\n' "${user_page}"
      # The first awk splits on the double-quote character and joins fields 4
      # and 5; the second splits on "<" / ">" and emits "<field1>@<field2>".
      # NOTE(review): this assumes anchors look like
      #   <a title="..." href="/topics/12345">title</a>
      # so that field 4 is the href value - confirm against the live markup.
      # -f/-sS: no progress meter, but real failures are reported on stderr.
      curl -fsS "${user_page}" \
        | grep -E 'href="(/articles|/topics)/[0-9]{5}' \
        | awk -F '"' '{print $4$5}' \
        | sort \
        | awk -F ">|<" '{print $1 "@" $2}' >> ./suffix_path.txt
    done

    sed 's/[[:space:]]//g' ./suffix_path.txt > ./suffix_path_name.txt

    # Read record by record instead of word-splitting the whole file, so a
    # record can never be split further or glob-expanded by the shell.
    while IFS= read -r j || [[ -n "${j}" ]]; do
      [[ -n "${j}" ]] || continue
      printf '%s\n' "文件行内容:${j}"
      arr_0=$(printf '%s\n' "${j}" | cut -d'@' -f1)
      arr_1=$(printf '%s\n' "${j}" | cut -d'@' -f2)
      printf '%s\n' "=================================="
      printf '%s\n' "arr[0] is: ${arr_0}"
      printf '%s\n' "arr[1] is: ${arr_1}"
      # base_path ends with "/" and the scraped path starts with "/"; trim both
      # sides so the URL does not end up with a double slash.
      topic_path="${base_path%/}/${arr_0#/}"
      printf '%s\n' "topic_path is: ${topic_path}"
      # A "/" in the title would be taken as a directory separator.
      file_name=${arr_1//\//_}
      # Fall back to the last path component when the title is empty.
      [[ -n "${file_name}" ]] || file_name=${arr_0##*/}
      # Best effort: report a failed download and carry on with the next topic.
      curl -fsS -o "./htmldoc/${file_name}.html" "${topic_path}" \
        || printf 'warning: failed to fetch %s\n' "${topic_path}" >&2
    done < ./suffix_path_name.txt

    ---------------------------------------------------------------------------------------
    方法二:改用 IFS 将每行按 "@" 拆分成数组来取字段(代替上面的 cut 写法),其余逻辑相同。

    #!/bin/bash
    #
    # Download the topics posted by one TesterHome user.
    #
    #   1. Walk pages 1-5 of the user's topic list and append every topic link
    #      to ./suffix_path.txt as "<path>@<title>".
    #   2. Strip all whitespace from those records into ./suffix_path_name.txt.
    #   3. Fetch each topic and save it as ./htmldoc/<title>.html.
    #
    # This variant splits each record into an array by using "@" as IFS.

    base_path="https://testerhome.com/"
    user_path="ycwdaaaa/topics?page="

    # Start from a clean slate. -f keeps a first run (nothing to delete) quiet,
    # and the output directory is created up front so the redirects below
    # cannot fail on a missing directory.
    rm -f -- ./suffix_path.txt ./suffix_path_name.txt
    mkdir -p ./htmldoc
    rm -f -- ./htmldoc/*

    for ((i = 1; i <= 5; i++)); do
      user_page="${base_path}${user_path}${i}"
      printf '%s\n' "${user_page}"
      # The first awk splits on the double-quote character and joins fields 4
      # and 5; the second splits on "<" / ">" and emits "<field1>@<field2>".
      # NOTE(review): this assumes anchors look like
      #   <a title="..." href="/topics/12345">title</a>
      # so that field 4 is the href value - confirm against the live markup.
      # -f/-sS: no progress meter, but real failures are reported on stderr.
      curl -fsS "${user_page}" \
        | grep -E 'href="(/articles|/topics)/[0-9]{5}' \
        | awk -F '"' '{print $4$5}' \
        | sort \
        | awk -F ">|<" '{print $1 "@" $2}' >> ./suffix_path.txt
    done

    sed 's/[[:space:]]//g' ./suffix_path.txt > ./suffix_path_name.txt

    # Read record by record instead of word-splitting the whole file, so a
    # record can never be split further or glob-expanded by the shell.
    while IFS= read -r j || [[ -n "${j}" ]]; do
      [[ -n "${j}" ]] || continue
      printf '%s\n' "文件行内容:${j}"
      # Prefixing the assignment scopes IFS="@" to this single read, so the
      # shell-wide IFS is never changed and needs no save/restore. (Saving IFS
      # inside the loop would capture "@" from the second iteration onwards and
      # "restore" that value after the loop.)
      IFS='@' read -r -a arr <<< "${j}"
      arr_0=${arr[0]:-}
      arr_1=${arr[1]:-}
      printf '%s\n' "=================================="
      printf '%s\n' "arr[0] is: ${arr_0}"
      printf '%s\n' "arr[1] is: ${arr_1}"
      # base_path ends with "/" and the scraped path starts with "/"; trim both
      # sides so the URL does not end up with a double slash.
      topic_path="${base_path%/}/${arr_0#/}"
      printf '%s\n' "topic_path is: ${topic_path}"
      # A "/" in the title would be taken as a directory separator.
      file_name=${arr_1//\//_}
      # Fall back to the last path component when the title is empty.
      [[ -n "${file_name}" ]] || file_name=${arr_0##*/}
      # Best effort: report a failed download and carry on with the next topic.
      curl -fsS -o "./htmldoc/${file_name}.html" "${topic_path}" \
        || printf 'warning: failed to fetch %s\n' "${topic_path}" >&2
    done < ./suffix_path_name.txt

  • 相关阅读:
    C++学习笔记27,虚函数作品
    HDU
    POJ 2524 Ubiquitous Religions
    HDU-3839-Ancient Messages(DFS)
    thinkphp 删除所有缓存 Runtime 以及 Html 静态缓存
    [AngularJS] Design Pattern: Simple Mediator
    [Javascript] Add a browser build to an npm module
    [Angular 2] ngrx/store
    [Typescript] Introduction to Generics in Typescript
    [AngularJS] angular-md-table for Angular material design
  • 原文地址:https://www.cnblogs.com/fqfanqi/p/11751582.html
Copyright © 2011-2022 走看看