zoukankan      html  css  js  c++  java
  • 【Golang】爬虫笔记


    阅读目录

    一、net/http

    二、grequests

    三、实战应用

    一、net/http

    net/http包提供了http客户端及服务端的实现

    1.简单使用

    下面通过HTTP最常见的几种请求方式(GET、POST、PUT、DELETE)演示net/http的基本用法

    GET

    // get issues a plain GET request to httpbin and prints the raw response
    // body to stdout. It panics if the request or the body read fails.
    func get() {
    	response, err := http.Get("http://httpbin.org/get")
    	if err != nil {
    		panic(err)
    	}
    	// The close error is intentionally discarded.
    	defer func() { _ = response.Body.Close() }()

    	body, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", body)
    }
    

    POST

    // post issues a POST request with no content type and no body to httpbin
    // and prints the raw response body. It panics on any error.
    func post() {
    	response, err := http.Post("http://httpbin.org/post", "", nil)
    	if err != nil {
    		panic(err)
    	}
    	// The close error is intentionally discarded.
    	defer func() { _ = response.Body.Close() }()

    	body, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", body)
    }
    

    PUT

    // put sends a bodiless PUT request to httpbin through the default client
    // and prints the raw response body. It panics on any request or read error.
    func put() {
    	request, err := http.NewRequest(http.MethodPut, "http://httpbin.org/put", nil)
    	if err != nil {
    		panic(err)
    	}
    	resp, err := http.DefaultClient.Do(request)
    	if err != nil {
    		// resp may be nil when Do fails, so stop before touching resp.Body.
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    DELETE

    // del sends a bodiless DELETE request to httpbin through the default client
    // and prints the raw response body. It panics on any request or read error.
    func del() {
    	request, err := http.NewRequest(http.MethodDelete, "http://httpbin.org/delete", nil)
    	if err != nil {
    		panic(err)
    	}
    	resp, err := http.DefaultClient.Do(request)
    	if err != nil {
    		// resp may be nil when Do fails, so stop before touching resp.Body.
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    2.请求参数

    GET请求参数

    // getWithParams sends a GET request to httpbin with the query parameters
    // name=ero and age=18 and prints the raw response body. It panics on any
    // request or read error.
    func getWithParams() {
    	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
    	if err != nil {
    		panic(err)
    	}
    	params := make(url.Values)
    	params.Add("name", "ero")
    	params.Add("age", "18")
    	// Encode escapes the values and joins them into the raw query string.
    	request.URL.RawQuery = params.Encode()
    	// Alternatively, put the query string directly in the URL:
    	// request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get?name=ero&age=18", nil)

    	resp, err := http.DefaultClient.Do(request)
    	if err != nil {
    		// resp may be nil when Do fails, so stop before touching resp.Body.
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    POST请求参数

    form表单
    // postForm posts name=ero and age=18 to httpbin as an
    // application/x-www-form-urlencoded body and prints the raw response body.
    // It panics on any request or read error.
    func postForm() {
    	data := make(url.Values)
    	data.Add("name", "ero")
    	data.Add("age", "18")
    	resp, err := http.Post("http://httpbin.org/post",
    		"application/x-www-form-urlencoded",
    		strings.NewReader(data.Encode()))
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		// A failed read would otherwise print a truncated body silently.
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    
    Json数据
    // postJson marshals a small anonymous struct to JSON, posts it to httpbin
    // with an application/json content type and prints the raw response body.
    // It panics on any marshal, request or read error.
    func postJson() {
    	u := struct {
    		Name string `json:"name"`
    		Age  int    `json:"age"`
    	}{
    		Name: "ero",
    		Age:  18,
    	}
    	payload, err := json.Marshal(u)
    	if err != nil {
    		panic(err)
    	}
    	resp, err := http.Post("http://httpbin.org/post", "application/json",
    		bytes.NewReader(payload))
    	if err != nil {
    		// resp is unusable on error; stop before touching resp.Body.
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    
    文件
    // postFile builds a multipart/form-data body holding one plain form field
    // and two files read from the working directory, prints the encoded body,
    // posts it to httpbin and prints the raw response body. It panics on any
    // file, encoding, request or read error.
    func postFile() {
    	// In-memory buffer that receives the encoded multipart body.
    	body := &bytes.Buffer{}
    	writer := multipart.NewWriter(body)
    	// Plain form fields can be mixed in with the file parts.
    	if err := writer.WriteField("number", "123456"); err != nil {
    		panic(err)
    	}

    	// The first name is the form field name, the second is the file name
    	// reported for the uploaded file.
    	if err := addFormFile(writer, "uploadFile1", "uploadFileName1", "uploadFileName1.txt"); err != nil {
    		panic(err)
    	}
    	if err := addFormFile(writer, "uploadFile2", "uploadFileName2", "uploadFileName2.txt"); err != nil {
    		panic(err)
    	}

    	// Close writes the trailing boundary; the body is incomplete without it.
    	if err := writer.Close(); err != nil {
    		panic(err)
    	}
    	fmt.Println(body.String())

    	resp, err := http.Post("http://httpbin.org/post", writer.FormDataContentType(), body)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }

    // addFormFile creates a file part named fieldName/fileName on w and copies
    // the contents of the file at path into it.
    func addFormFile(w *multipart.Writer, fieldName, fileName, path string) error {
    	part, err := w.CreateFormFile(fieldName, fileName)
    	if err != nil {
    		return err
    	}
    	f, err := os.Open(path)
    	if err != nil {
    		return err
    	}
    	// Read-only handle: the close error carries no useful information.
    	defer func() { _ = f.Close() }()

    	_, err = io.Copy(part, f)
    	return err
    }
    

    3.请求头

    // getWithHeaders sends a GET request to httpbin with two custom request
    // headers and prints the raw response body. It panics on any request or
    // read error.
    func getWithHeaders() {
    	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/get", nil)
    	if err != nil {
    		panic(err)
    	}

    	request.Header.Add("name", "ero")
    	request.Header.Add("user-agent", "chrome")

    	resp, err := http.DefaultClient.Do(request)
    	if err != nil {
    		// resp may be nil when Do fails, so stop before touching resp.Body.
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    4.Cookie信息

    手动保存

    // manualSetCookies demonstrates carrying cookies between requests by hand.
    // The first request asks httpbin to set two cookies; redirects are disabled
    // so the response carrying them is returned to the caller. Those cookies
    // are then copied onto a second request, whose cookies and body are
    // printed. It panics on any request or read error.
    func manualSetCookies() {
    	client := &http.Client{
    		CheckRedirect: func(req *http.Request, via []*http.Request) error {
    			// Do not follow redirects; hand back the redirect response itself.
    			return http.ErrUseLastResponse
    		},
    	}
    	firstRequest, err := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)
    	if err != nil {
    		panic(err)
    	}
    	resp, err := client.Do(firstRequest)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()
    	fmt.Printf("%s\n", resp.Cookies())

    	secondRequest, err := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies", nil)
    	if err != nil {
    		panic(err)
    	}
    	for _, cookie := range resp.Cookies() {
    		secondRequest.AddCookie(cookie)
    	}
    	resp2, err := client.Do(secondRequest)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = resp2.Body.Close() }()
    	fmt.Printf("%s\n", resp2.Cookies())

    	content, err := ioutil.ReadAll(resp2.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    cookiejar自动保存

    // autoSetCookies performs the cookie-setting request through a client that
    // has a cookie jar attached and streams the final response body to stdout.
    // It panics if the request fails.
    func autoSetCookies() {
    	cookieJar, _ := cookiejar.New(nil)
    	client := new(http.Client)
    	client.Jar = cookieJar

    	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero&pwd=123", nil)
    	response, err := client.Do(req)
    	if err != nil {
    		panic(err)
    	}
    	defer response.Body.Close()

    	// Copy errors are deliberately ignored, as is the byte count.
    	_, _ = io.Copy(os.Stdout, response.Body)
    }
    

    cookie保存持久化

    浏览器会自动持久化cookie,而标准库net/http/cookiejar保存的cookie只存在于内存中,仅在当前进程(一次会话)内有效,进程退出后即丢失。
    可以使用github.com/juju/persistent-cookiejar将cookie持久化保存。

    jar, _ := cookiejar2.New(nil)
    ...
    // Call Save at the very end of execution so the jar is persisted.
    _ = jar.Save()
    

    5.响应信息

    状态码

    // status prints the response's status line text followed by its numeric
    // status code, each on its own line.
    func status(r *http.Response) {
    	fmt.Printf("%s\n%d\n", r.Status, r.StatusCode)
    }
    

    响应头

    // headers prints the value of the response header called "name", or an
    // empty line when that header is absent.
    func headers(r *http.Response) {
    	fmt.Println(r.Header.Get("name"))
    }
    

    编码

    // encoding detects the character encoding of the response body, prints the
    // detected encoding name, then decodes the whole body with that encoding
    // and prints the decoded text. It panics if the body cannot be read.
    func encoding(r *http.Response) {
    	reader := bufio.NewReader(r.Body)
    	// Peek returns the leading bytes without advancing the reader, so the
    	// full body is still available for decoding afterwards.
    	peeked, err := reader.Peek(1024)
    	// A body shorter than the requested size yields io.EOF together with the
    	// bytes that were available; that is not a failure here.
    	if err != nil && err != io.EOF {
    		fmt.Println(err.Error())
    		panic(err)
    	}
    	// Detection combines the sniffed bytes with the declared Content-Type.
    	e, name, _ := charset.DetermineEncoding(peeked, r.Header.Get("content-type"))
    	fmt.Println(name)
    	// Wrap the reader so that everything read from it is decoded.
    	bodyReader := transform.NewReader(reader, e.NewDecoder())
    	// Read the decoded content.
    	content, err := ioutil.ReadAll(bodyReader)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(string(content))
    }
    

    6.超时时间

    // timeoutTest requests an httpbin endpoint that delays its reply by ten
    // seconds through a client configured with shorter timeouts, and prints the
    // response body if one arrives. It panics on any request or read error.
    func timeoutTest() {
    	dialer := &net.Dialer{Timeout: 2 * time.Second}
    	client := &http.Client{
    		// The transport allows finer-grained timeouts than Client.Timeout.
    		Transport: &http.Transport{
    			DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
    				// Pass ctx through so a cancelled request also aborts the dial.
    				return dialer.DialContext(ctx, network, addr)
    			},
    			TLSHandshakeTimeout:   5 * time.Second,
    			ResponseHeaderTimeout: 5 * time.Second,
    		},
    		// Overall limit for the whole exchange.
    		Timeout: 5 * time.Second,
    	}
    	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/delay/10", nil)
    	if err != nil {
    		panic(err)
    	}
    	response, err := client.Do(request)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = response.Body.Close() }()

    	// Print the body content rather than the Body value itself.
    	content, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    7.代理

    // main fetches a page through a local HTTP proxy and prints the response
    // body. It panics if the proxy URL is invalid or the request or read fails.
    func main() {
    	//proxyUrl, err := url.Parse("socks5://127.0.0.1:1080")  // SOCKS5 proxy
    	proxyUrl, err := url.Parse("http://127.0.0.1:1080") // HTTP proxy
    	if err != nil {
    		panic(err)
    	}
    	t := &http.Transport{
    		Proxy: http.ProxyURL(proxyUrl),
    	}
    	client := &http.Client{Transport: t}
    	resp, err := client.Get("https://www.google.com")
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = resp.Body.Close() }()

    	// Print the body content rather than the Body value itself.
    	content, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%s", content)
    }
    

    8.重定向

    限制重定向次数

    // redirectLimitTimes requests an httpbin endpoint that redirects twenty
    // times through a client whose redirect policy rejects the request once
    // more than ten earlier requests are in the chain. It panics when the
    // request fails, including when the policy rejects it.
    func redirectLimitTimes() {
    	client := http.Client{
    		CheckRedirect: func(req *http.Request, via []*http.Request) error {
    			// via holds the requests made so far, oldest first.
    			if len(via) > 10 {
    				return errors.New("redirect times > 10")
    			}
    			return nil
    		},
    	}

    	request, err := http.NewRequest(http.MethodGet, "http://httpbin.org/redirect/20", nil)
    	if err != nil {
    		panic(err)
    	}
    	resp, err := client.Do(request)
    	if err != nil {
    		panic(err)
    	}
    	// The body is not needed, but it must still be closed to release the
    	// underlying connection.
    	_ = resp.Body.Close()
    }
    

    禁止重定向

    // redirectForbidden sends a request whose reply is a redirect through a
    // client that refuses to follow redirects, then prints the URL of the
    // request that produced the returned response. It panics if the request
    // fails.
    func redirectForbidden() {
    	// Returning ErrUseLastResponse hands back the redirect response itself.
    	noFollow := func(req *http.Request, via []*http.Request) error {
    		return http.ErrUseLastResponse
    	}
    	client := &http.Client{CheckRedirect: noFollow}

    	req, _ := http.NewRequest(http.MethodGet, "http://httpbin.org/cookies/set?name=ero", nil)

    	// http.DefaultClient.Do(req) would follow the redirect to another URL;
    	// this client does not.
    	response, err := client.Do(req)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = response.Body.Close() }()

    	fmt.Printf("%s", response.Request.URL)
    }
    

    9.下载

    简单下载

    // downloadFile fetches url and writes the response body to a newly created
    // file called filename, then prints the number of bytes written. It panics
    // if the request, file creation, copy or file close fails.
    func downloadFile(url, filename string) {
    	r, err := http.Get(url)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = r.Body.Close() }()

    	f, err := os.Create(filename)
    	if err != nil {
    		panic(err)
    	}

    	written, err := io.Copy(f, r.Body)
    	// Close before inspecting the copy error so the handle is released on
    	// every path; a close error on a written file can mean lost data.
    	closeErr := f.Close()
    	if err != nil {
    		panic(err)
    	}
    	if closeErr != nil {
    		panic(closeErr)
    	}
    	fmt.Println(written)
    }
    

    下载进度

    // Reader wraps an io.Reader and reports progress on every Read call.
    type Reader struct {
    	io.Reader
    	Total         int64 // expected total number of bytes; <= 0 means unknown
    	CurrentLength int64 // number of bytes read so far
    }

    // Read implements io.Reader. It delegates to the wrapped reader, adds the
    // bytes read to CurrentLength and prints the progress: a percentage when
    // Total is positive, otherwise the running byte count, because a
    // percentage cannot be computed without a known total.
    func (r *Reader) Read(p []byte) (n int, err error) {
    	n, err = r.Reader.Read(p)
    	r.CurrentLength += int64(n)
    	if r.Total > 0 {
    		// Integer arithmetic truncates the percentage to two decimals.
    		fmt.Printf("\n进度 %.2f%%\n", float64(r.CurrentLength*10000/r.Total)/100)
    	} else {
    		fmt.Printf("\n进度 %d bytes\n", r.CurrentLength)
    	}
    	return n, err
    }
    
    // downloadFileProgress fetches url and writes the response body to a newly
    // created file called filename, reading through a Reader that reports
    // progress against the response's ContentLength. It prints the number of
    // bytes written and panics if the request, file creation, copy or file
    // close fails.
    func downloadFileProgress(url, filename string) {
    	r, err := http.Get(url)
    	if err != nil {
    		panic(err)
    	}
    	defer func() { _ = r.Body.Close() }()

    	f, err := os.Create(filename)
    	if err != nil {
    		panic(err)
    	}
    	reader := &Reader{
    		Reader: r.Body,
    		Total:  r.ContentLength,
    	}
    	n, err := io.Copy(f, reader)
    	// Close before inspecting the copy error so the handle is released on
    	// every path; a close error on a written file can mean lost data.
    	closeErr := f.Close()
    	if err != nil {
    		panic(err)
    	}
    	if closeErr != nil {
    		panic(closeErr)
    	}
    	fmt.Println(n)
    }
    

    二、grequests

    语法更简单,使用过python requests包的同学肯定会很喜欢

    1.安装

    go get -u github.com/levigross/grequests
    

    2.GET示例

    import "github.com/levigross/grequests"
    
    // Query parameters are supplied through the request options. The type lives
    // in the grequests package and must be qualified with it.
    ro := &grequests.RequestOptions{
    	Params: map[string]string{"Hello": "Goodbye"},
    }
    // Parameters already present in the URL are overridden by Params.
    resp, err := grequests.Get("http://httpbin.org/get?Hello=11", ro)

    if err != nil {
    	log.Fatalln("Unable to make request: ", err)
    }

    fmt.Println(resp.String())
    

    3.POST示例

    resp, err := grequests.Post("http://httpbin.org/post",
        &grequests.RequestOptions{Data: map[string]string{"One": "Two"}})
    
    if err != nil {
        log.Println("Cannot post: ", err)
    }
    
    if resp.Ok != true {
        log.Println("Request did not return OK")
    }
    

    4.POST上传文件

    // Build the upload description from a file on disk.
    fd, err := grequests.FileUploadFromDisk("test_files/mypassword")

    if err != nil {
        log.Println("Unable to open file: ", err)
    }

    // Setting Files makes the request a multipart MIME upload; the Data map is
    // sent along with it.
    uploadOptions := &grequests.RequestOptions{
        Files: fd,
        Data:  map[string]string{"One": "Two"},
    }
    resp, err := grequests.Post("http://httpbin.org/post", uploadOptions)

    if err != nil {
        log.Println("Unable to make request", resp.Error)
    }

    if !resp.Ok {
        log.Println("Request did not return OK")
    }
    

    三、实战应用

  • 相关阅读:
    ParksLink修改密码
    ORA-01940:无法删除当前已链接的用户
    imp导入数据的时候报错:ORA-01658: 无法为表空间 MAXDATA 中的段创建 INITIAL 区
    Linux下查看日志用到的常用命令
    大批量数据高效插入数据库表
    线程中断:Thread类中interrupt()、interrupted()和 isInterrupted()方法详解
    CyclicBarrier、CountDownLatch、Callable、FutureTask、thread.join() 、wait()、notify()、Condition
    Mysql全文索引
    Docker 镜像的常用操作
    Docker 入门
  • 原文地址:https://www.cnblogs.com/zhangliang91/p/12814057.html
Copyright © 2011-2022 走看看