zoukankan      html  css  js  c++  java
  • 纯golang爬虫实战(二)-爬取内网网站信息

    接上一篇文章 https://www.cnblogs.com/pu369/p/12202845.html (只讲了原理),抽时间写了个实用版,将员工信息爬取到一个 TXT 文档中,以便于查询,上代码:

    //纯golang爬虫
    package main
    
    import (
        "bytes"
        "fmt"
        "io/ioutil"
        "net/http"
        "net/http/cookiejar"
        "regexp"
        "strings"
    )
    
    // MySpider holds the state shared by the crawl steps: the entry URL, the
    // HTTP client used for every request, and the buffer that collects the
    // extracted text.
    type MySpider struct {
        // indexUrl is the entry page requested by login before authenticating.
        indexUrl string
        // cleint is the HTTP client used for all requests. The field name is
        // spelled "cleint" throughout the file, so it is kept as-is.
        cleint   *http.Client
        // buf accumulates the text of every saved page; saveall writes its
        // contents to hrp.txt.
        buf      *bytes.Buffer
    }
    
    // login requests the index page, then the login endpoint (a GET with the
    // credentials in the query string is used in place of a POST), and finally
    // starts the crawl by calling saveall.
    //
    // It returns the body of the login response. On any failure the returned
    // string is the placeholder "err" and the error is non-nil.
    func (this MySpider) login() (string, error) {
        // Visit the index page first (presumably so the server can set its
        // session cookie - confirm against the site). Only the request itself
        // matters, so the body is closed right away instead of being leaked.
        resp, err := this.cleint.Get(this.indexUrl)
        if err != nil {
            return "err", fmt.Errorf("fetching index page: %w", err)
        }
        resp.Body.Close()

        // Request the login endpoint.
        resp, err = this.cleint.Get("http://192.168.13.1:8080/login/auth?name=XX&password=XX&scurity=s&type=0&typeField=0")
        if err != nil {
            // resp is nil here, so it must not be touched.
            return "err", fmt.Errorf("requesting login page: %w", err)
        }
        defer resp.Body.Close()

        body, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            return "err", fmt.Errorf("reading login response: %w", err)
        }

        // Crawl all records with the same client.
        if _, err := this.saveall(); err != nil {
            return "err", fmt.Errorf("crawling records: %w", err)
        }
        return string(body), nil
    }
    
    // saveall downloads every employee record with id 2..19999 into the
    // in-memory buffer and then writes the whole buffer to hrp.txt in one go.
    // Everything is held in memory until the end, so a very large data set may
    // exhaust memory.
    //
    // A record that fails to download does not stop the crawl: the remaining
    // ids are still fetched and the file is still written. The first such
    // failure is returned afterwards so that it is not silently lost. On
    // failure the returned string is the placeholder "err".
    func (this MySpider) saveall() (string, error) {
        var firstErr error

        // The smallest id is 2; ids are 8-character strings, zero-padded on
        // the left.
        for id := 2; id < 20000; id++ {
            idstr := fmt.Sprintf("%08d", id)

            // Main page of the employee record.
            url := "http://192.168.13.1:8080/browse/basicinfo_p.jsp?rtpage=psnfrm&pid=" + idstr + "&func=0297&userbase=Usr"
            if _, err := this.saveone(url, idstr); err != nil && firstErr == nil {
                firstErr = fmt.Errorf("record %s: %w", idstr, err)
            }
        }

        // Persist whatever was collected, even if some records failed.
        if err := ioutil.WriteFile("hrp.txt", this.buf.Bytes(), 0644); err != nil {
            return "err", fmt.Errorf("writing hrp.txt: %w", err)
        }
        if firstErr != nil {
            return "err", firstErr
        }
        return "", nil
    }
    
    //下载某人员的主页面内容、保存到bytes.buffer
    func (this MySpider) saveone(url, idstr string) (string, error) {
        resp, err := this.cleint.Get(url)
        if err != nil {
            return "err", err
        }
        body, err := ioutil.ReadAll(resp.Body)
        defer resp.Body.Close()
        stringbody := string(body)
        //fmt.Print(string(body))
        //判断主页面是否包含字样:Apache Tomcat/5.0.19 - Error report |  HTTP Status 500
        if ko := strings.Contains(stringbody, "Apache"); !ko {
            //主页面正常,则保存
            this.buf.Write([]byte(idstr + "
    "))
            trimbody := []byte(trimHtml(stringbody))
            this.buf.Write(trimbody)
            this.buf.Write([]byte("
    "))
            //有主页面,则下载辅助页面
            //员工记录附加页面-1学历
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A04&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-2岗位
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A17&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-3简历
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A19&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-4合同
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=AZ3&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-5流动
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A16&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-6关系
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A79&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-7家庭
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A82&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-8聘任
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=AZT&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-9职务
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A07&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-10专业
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A10&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-11工人
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A13&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-12奖励
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A28&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            //员工记录附加页面-13惩罚
            url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A29&pid=" + idstr + "&userbase=Usr"
            this.saveonemore(url)
            this.buf.Write([]byte("
    
    "))
        }
        return "", err
    }
    
    //下载某人员的辅助页面内容、保存到bytes.buffer
    func (this MySpider) saveonemore(url string) (string, error) {
        resp, err := this.cleint.Get(url)
        if err != nil {
            return "err", err
        }
        body, err := ioutil.ReadAll(resp.Body)
        defer resp.Body.Close()
        stringbody := string(body)
        trimbody := []byte(trimHtml(stringbody))
        this.buf.Write(trimbody)
        this.buf.Write([]byte("
    "))
        return "", err
    }
    
    // Patterns used by trimHtml, compiled once at package initialisation. Raw
    // string literals are used so that the regex escapes reach the regexp
    // package unchanged.
    var (
        reHTMLTag    = regexp.MustCompile(`<[\S\s]+?>`)
        reHTMLStyle  = regexp.MustCompile(`<style[\S\s]+?</style>`)
        reHTMLScript = regexp.MustCompile(`<script[\S\s]+?</script>`)
        // The trailing semicolon is optional so that neither "&nbsp;" nor a
        // bare "&nbsp" leaves residue behind.
        reHTMLNbsp  = regexp.MustCompile(`&nbsp;?`)
        reHTMLSpace = regexp.MustCompile(`\s{2,}`)
    )

    // trimHtml strips HTML markup from src and returns the remaining text.
    //
    // Tags are lower-cased first so that the style/script patterns match
    // regardless of the original case; <style> and <script> elements are then
    // removed together with their content, every remaining tag and every
    // "&nbsp;" entity is replaced by a single space, each run of two or more
    // whitespace characters becomes the separator " |  ", and leading/trailing
    // whitespace is trimmed.
    func trimHtml(src string) string {
        // Lower-case everything inside angle brackets.
        src = reHTMLTag.ReplaceAllStringFunc(src, strings.ToLower)
        // Drop STYLE elements, content included.
        src = reHTMLStyle.ReplaceAllString(src, " ")
        // Drop SCRIPT elements, content included.
        src = reHTMLScript.ReplaceAllString(src, " ")
        // Replace every remaining tag with a space.
        src = reHTMLTag.ReplaceAllString(src, " ")
        // Replace non-breaking-space entities with a space.
        src = reHTMLNbsp.ReplaceAllString(src, " ")
        // Collapse whitespace runs (including line breaks) into a separator.
        src = reHTMLSpace.ReplaceAllString(src, " |  ")
        return strings.TrimSpace(src)
    }
    
    // run builds a reusable HTTP client with a cookie jar, so that cookies set
    // by one response are sent with later requests, and then calls login, which
    // in turn performs the crawl.
    //
    // It panics if the cookie jar cannot be created. A failed login or crawl is
    // reported on standard output instead of being silently dropped. The
    // returned string is always empty.
    func (this MySpider) run() string {
        jar, err := cookiejar.New(nil)
        if err != nil {
            panic(err)
        }

        // The receiver is a copy (value receiver); the client is stored on that
        // copy, which is the one login is invoked on below.
        this.cleint = &http.Client{Jar: jar}

        // Log in (GET in place of POST) and crawl.
        if _, err := this.login(); err != nil {
            fmt.Println("crawl failed:", err)
        }
        return ""
    }
    
    // main creates the spider with its entry address and an empty output
    // buffer, then runs it.
    func main() {
        spider := &MySpider{
            // Entry address of the intranet site.
            indexUrl: "http://192.168.13.1:8080",
            buf:      new(bytes.Buffer),
        }
        spider.run()
    }
  • 相关阅读:
    XML 文档(1, 2)中有错误:不应有 <xml xmlns=''>
    工厂模式、控制反转及依赖注入
    [译]新的CCSDS图像压缩推荐标准
    关于C#的静态类和静态构造函数
    c++友元函数与友元类
    malloc/free 与 new/delete 比较
    C++ main函数命令行参数使用
    进程和线程的区别
    堆和栈的区别
    java与C++的区别
  • 原文地址:https://www.cnblogs.com/pu369/p/12228401.html
Copyright © 2011-2022 走看看