zoukankan      html  css  js  c++  java
  • 幽灵蛛(pholcus)(三)--header、get、post学习资料

     转载请注明出处:http://www.cnblogs.com/SSSR/p/6349298.html

    get和post参考:http://ju.outofmemory.cn/entry/96382

    登录知乎:https://github.com/DeanThompson/zhihu-go

    并发:http://studygolang.com/articles/5658 

    https://sanwen8.cn/p/5985D5k.html

    分布式爬虫:https://www.v2ex.com/t/179342

    爬妹子图:http://www.thinksaas.cn/topics/0/425/425080.html

    User-Agent 的用法见以下例子:每次请求时从列表中随机选择一个 User-Agent。

    package http
    import (
        "net/http"
        "io/ioutil"
        "fmt"
        //"net/url"
    	"log"
    	"strings"
    	"math/rand"
    	"time"
    	"regexp"
    	"encoding/xml"
    	
    )
    
    // atagRegExp matches a whole anchor element (<a ...>text</a>) that has a
    // quoted href attribute and whose content contains no nested tag.
    // Matching is case-insensitive, so <A HREF='...'> is accepted as well.
    //
    // The pattern is compiled once at package initialisation. Functions with
    // the Must prefix are expected to always succeed and panic otherwise, which
    // is acceptable here because the pattern is a constant.
    var atagRegExp = regexp.MustCompile(`(?i)<a\s[^>]*href\s*=\s*(?:"[^"]*"|'[^']*')[^>]*>[^<]*</a>`)
    // Spy issues a GET request for url with a randomly chosen User-Agent header
    // and prints the href of every anchor found in the response body.
    //
    // Links whose href contains "article/details/" are printed with a "☆"
    // prefix, all others with "□"; each href is then printed once more on its
    // own line. Responses with a status other than 200 are ignored. Failures
    // are logged with an "[E]" prefix instead of being returned, and any panic
    // raised while processing is recovered and logged the same way.
    //
    // NOTE(review): http.DefaultClient is used as-is; confirm whether a request
    // timeout should be configured by the caller.
    func Spy(url string) {
        defer func() {
            if rec := recover(); rec != nil {
                log.Println("[E]", rec)
            }
        }()

        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            // Without this check a malformed URL leaves req nil and the
            // Header.Set call below would panic.
            log.Println("[E]", fmt.Errorf("build request for %s: %s", url, err))
            return
        }
        req.Header.Set("User-Agent", GetRandomUserAgent())

        res, err := http.DefaultClient.Do(req)
        if err != nil {
            log.Println("[E]", fmt.Errorf("Get请求%s返回错误:%s", url, err))
            return
        }
        // Close the body on every path, not only for status 200, so the
        // underlying connection is released.
        defer res.Body.Close()

        if res.StatusCode != http.StatusOK {
            return
        }

        bodyBytes, err := ioutil.ReadAll(res.Body)
        if err != nil {
            // Best effort: report the failure but still scan whatever was read.
            log.Println("[E]", fmt.Errorf("read body of %s: %s", url, err))
        }

        for _, a := range atagRegExp.FindAllString(string(bodyBytes), -1) {
            href, _ := GetHref(a)
            if strings.Contains(href, "article/details/") {
                fmt.Println("☆", href)
            } else {
                fmt.Println("□", href)
            }
            //urlChannel <- href
            fmt.Println(href)
        }
    }
    
    // userAgent is the fixed pool of User-Agent header values from which
    // GetRandomUserAgent picks one per call.
    var userAgent = [...]string{
        "Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
        "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
        // The closing parenthesis was missing here, leaving a truncated value.
        "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0)",
        "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
        "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    }
    
    // r is the package-level pseudo-random generator, seeded once from the
    // wall clock (nanoseconds) when the package is initialised.
    //
    // NOTE(review): a generator built with rand.NewSource is documented as not
    // safe for concurrent use; confirm callers do not share it across goroutines.
    var r = rand.New(rand.NewSource(time.Now().UnixNano()))

    // GetRandomUserAgent returns one entry of userAgent chosen at random.
    func GetRandomUserAgent() string {
        idx := r.Intn(len(userAgent))
        return userAgent[idx]
    }
    
    // GetHref extracts the link target and the text of a single anchor element.
    //
    // atag is the markup of one anchor, e.g. `<a href="/x">title</a>`. The
    // returned href is the value of its href attribute (attribute name compared
    // case-insensitively) and content is the last run of character data seen.
    // Both are empty when nothing could be extracted.
    //
    // Parsing is best effort: the decoder runs in the HTML-tolerant
    // configuration described by encoding/xml (non-strict, HTML auto-close
    // elements, HTML entities), and on a parse error whatever was gathered up
    // to that point is returned.
    func GetHref(atag string) (href, content string) {
        decoder := xml.NewDecoder(strings.NewReader(atag))
        // Anchors scraped from real pages are HTML, not well-formed XML.
        decoder.Strict = false
        decoder.AutoClose = xml.HTMLAutoClose
        decoder.Entity = xml.HTMLEntity

        for {
            t, err := decoder.Token()
            if err != nil {
                // io.EOF or a syntax error: stop and return what we have.
                break
            }
            switch token := t.(type) {
            case xml.StartElement:
                // Element start: look for the href attribute.
                for _, attr := range token.Attr {
                    if strings.EqualFold(attr.Name.Local, "href") {
                        href = attr.Value
                    }
                }
            case xml.CharData:
                // Character data: the text of the element.
                content = string(token)
            }
            // Other tokens (end elements, comments, directives, processing
            // instructions) carry nothing of interest and must not discard
            // values that were already extracted.
        }
        return href, content
    }
    

      

  • 相关阅读:
    webdriver学习
    [Sqlite]-->Java使用jdbc连接Sqlite数据库进行各种数据操作的详细过程(转)
    java 二维码
    java 解析json超大文件(转)
    嵌套三目运算符
    实体的字段以is开头的教训
    easyui中formatter的使用
    springmvc中的controller是单例的
    hibernate 中baseservice中添加事物
    easyui中添加富文本编辑器
  • 原文地址:https://www.cnblogs.com/SSSR/p/6349298.html
Copyright © 2011-2022 走看看