zoukankan      html  css  js  c++  java
  • 纯golang爬虫实战(一)

    纯golang爬取内网网站数据

    参考https://blog.csdn.net/CrazyJavaPerson/article/details/81871649

    难点一:需要登录。参考我的上一篇文章 https://www.cnblogs.com/pu369/p/12201707.html ,找到 POST 登录的 url 和 Form Data;经测试,也可以改用 GET 方式登录。

    难点二:登录后访问时需要携带 cookie。参考 https://blog.csdn.net/liu_rong_fei/article/details/51820793 ,用 resp.Cookies() 可以得到 Cookies;还可参考 https://www.cnblogs.com/cnsanshao/p/7084808.html 。最后参考 https://segmentfault.com/q/1010000005889328 ,决定还是用 cookiejar 和 http.Client,这样后续请求时 client 会自动带上 cookie。

    难点三:我只想保留文字,参考https://studygolang.com/articles/9360去掉html标签

    上代码(密码用XX代替了,根据网站特点,主要是URL参数变化且是连续数字,只需写个循环访问即可)

    //纯golang爬虫
    package main
    
    import (
        "fmt"
        "io/ioutil"
        "net/http"
        "net/http/cookiejar"
        "regexp"
        "strings"
    )
    
    // MySpider carries the state shared by the crawler's login and run methods.
    type MySpider struct {
        // indexUrl is the entry address of the site; main sets it and login
        // fetches it as its last request.
        indexUrl string
        // cleint is the HTTP client that login sends its requests through; run
        // assigns one backed by a cookie jar. (The field name is a misspelling
        // of "client"; it is kept because the methods refer to it by this name.)
        cleint   *http.Client
    }
    
    // login signs in to the intranet site (using a GET request in place of the
    // login form's POST) and downloads one profile page.
    //
    // Every request goes through s.cleint so that the cookies stored in its
    // cookie jar are sent back on the following requests. The steps are:
    //
    //  1. GET the site root (presumably so the server can hand out its initial
    //     session cookie before the login request; the body is discarded).
    //  2. GET the login URL with the credentials in the query string and print
    //     the response.
    //  3. GET the profile page, print it, strip the markup with trimHtml and
    //     save the remaining text to test.txt.
    //  4. GET s.indexUrl and return its body.
    //
    // It returns the body of the index page. On any failure it stops at the
    // failing step and returns "" together with an error describing that step.
    func (s MySpider) login() (string, error) {
        if s.cleint == nil {
            return "", fmt.Errorf("login: http client is not set")
        }

        if _, err := getBody(s.cleint, "http://192.168.13.1:8080/"); err != nil {
            return "", fmt.Errorf("opening start page: %w", err)
        }

        body, err := getBody(s.cleint, "http://192.168.13.1:8080/login/auth?name=XX&password=XX&scurity=s&type=0&typeField=0")
        if err != nil {
            return "", fmt.Errorf("logging in: %w", err)
        }
        fmt.Print(string(body))

        body, err = getBody(s.cleint, "http://192.168.13.1:8080/browse/basicinfo_p.jsp?rtpage=psnfrm&pid=00000164&func=0297&userbase=Usr")
        if err != nil {
            return "", fmt.Errorf("fetching profile page: %w", err)
        }
        fmt.Print(string(body))

        // Keep only the text of the profile page and save it to a file.
        text := []byte(trimHtml(string(body)))
        if err := ioutil.WriteFile("test.txt", text, 0644); err != nil {
            return "", fmt.Errorf("saving profile text: %w", err)
        }

        // Use the same client as above (not http.Get) so that the index page
        // is requested with the session cookie obtained during login.
        body, err = getBody(s.cleint, s.indexUrl)
        if err != nil {
            return "", fmt.Errorf("fetching index page: %w", err)
        }
        return string(body), nil
    }

    // getBody performs a GET request with client and returns the complete
    // response body. The body is always closed before returning.
    func getBody(client *http.Client, url string) ([]byte, error) {
        resp, err := client.Get(url)
        if err != nil {
            return nil, err
        }
        defer resp.Body.Close()

        body, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            return nil, fmt.Errorf("reading response body: %w", err)
        }
        return body, nil
    }
    
    // run creates an HTTP client backed by a cookie jar and performs the login
    // sequence with it.
    //
    // The receiver is a value, so the client is stored on run's own copy of the
    // spider and reaches login through that copy; the caller's MySpider is not
    // modified.
    //
    // It returns the page returned by login. If login fails, the error is
    // printed and "" is returned.
    func (s MySpider) run() string {
        // One client is reused for every request so that cookies set by one
        // response are sent with the next request.
        jar, err := cookiejar.New(nil)
        if err != nil {
            panic(err)
        }
        s.cleint = &http.Client{Jar: jar}

        // Log in (GET is used in place of the form's POST).
        page, err := s.login()
        if err != nil {
            fmt.Println("login failed:", err)
            return ""
        }
        return page
    }
    // Patterns used by trimHtml. They are compiled once at start-up and written
    // as raw string literals so the backslashes reach the regexp engine intact.
    var (
        // htmlTagRe matches one tag: from "<" to the nearest following ">".
        htmlTagRe = regexp.MustCompile(`<[\S\s]+?>`)
        // htmlStyleRe matches a (lower-cased) style element with its content.
        htmlStyleRe = regexp.MustCompile(`<style[\S\s]+?</style>`)
        // htmlScriptRe matches a (lower-cased) script element with its content.
        htmlScriptRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
        // htmlSpaceRe matches a run of two or more whitespace characters.
        htmlSpaceRe = regexp.MustCompile(`\s{2,}`)
    )

    // trimHtml strips the markup from src and returns the remaining text.
    //
    // Tags are lower-cased first so that the style and script patterns also
    // match upper-case markup; style and script elements are then removed
    // together with their content, every remaining tag is replaced by a
    // newline, each run of two or more whitespace characters is collapsed into
    // a single newline, and leading and trailing whitespace is trimmed.
    func trimHtml(src string) string {
        // Lower-case everything inside angle brackets.
        src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
        // Drop style elements.
        src = htmlStyleRe.ReplaceAllString(src, "")
        // Drop script elements.
        src = htmlScriptRe.ReplaceAllString(src, "")
        // Replace every remaining tag with a newline.
        src = htmlTagRe.ReplaceAllString(src, "\n")
        // Collapse runs of whitespace into a single newline.
        src = htmlSpaceRe.ReplaceAllString(src, "\n")
        return strings.TrimSpace(src)
    }
    
    // main builds a spider pointed at the intranet entry address and runs it.
    func main() {
        // Entry address: http://192.168.13.1:8080
        spider := &MySpider{indexUrl: "http://192.168.13.1:8080"}
        spider.run()
    }
  • 相关阅读:
    Lombok Pojo默认初始值问题
    spring boot打包以及centos下部署
    Spring事件监听ApplicationListener源码流程分析
    synchronized是什么,用法及原理
    Spring动态切换数据源及事务
    linux环境中关闭tomcat,通过shutdown.sh无法彻底关闭--线程池
    LVS之DR模式
    LVS之ipvsadm命令
    LVS之NAT模式
    tcpdump抓包命令
  • 原文地址:https://www.cnblogs.com/pu369/p/12202845.html
Copyright © 2011-2022 走看看