zoukankan      html  css  js  c++  java
  • Go 爬虫:colly 使用 XPath 解析

    package main
    
    import (
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"github.com/gocolly/colly"
    	"log"
    	"strings"
    	"time"
    )
    
    // main fetches the yeves.cn home page with a colly collector, parses the
    // response body with htmlquery, and prints the href and text of every link
    // found directly under the <li> items selected by the XPath query below.
    func main() {
    	c := colly.NewCollector(
    		colly.AllowedDomains("yeves.cn"),
    	)

    	// Apply a randomized delay (bounded by RandomDelay) to requests on all
    	// domains. Limit rejects malformed rules, so surface that error.
    	if err := c.Limit(&colly.LimitRule{
    		DomainGlob:  "*",
    		RandomDelay: 1 * time.Second,
    	}); err != nil {
    		log.Fatal(err)
    	}

    	c.OnRequest(func(r *colly.Request) {
    		fmt.Println("Visiting", r.URL.String())
    	})

    	// After a response is received, extract the links with XPath.
    	c.OnResponse(func(r *colly.Response) {
    		doc, err := htmlquery.Parse(strings.NewReader(string(r.Body)))
    		if err != nil {
    			// Report the bad page and keep the collector alive instead of
    			// terminating the whole process from inside a callback.
    			log.Printf("parsing %s: %v", r.Request.URL, err)
    			return
    		}

    		items := htmlquery.Find(doc, `//*[@id="secondary"]/section[2]/ul//li`)
    		for _, item := range items {
    			a := htmlquery.FindOne(item, "./a[@href]")
    			if a == nil {
    				// This <li> has no direct <a href> child; InnerText would
    				// panic on a nil node, so skip it.
    				continue
    			}
    			fmt.Println(htmlquery.SelectAttr(a, "href"), htmlquery.InnerText(a))
    		}
    	})

    	// Visit reports request failures and disallowed domains through its
    	// return value; do not drop it.
    	if err := c.Visit("https://yeves.cn/"); err != nil {
    		log.Fatal(err)
    	}
    }
    
  • 相关阅读:
    POJ 1936 All in All
    POJ 2305 Basic remains
    POJ 2081 Recaman's Sequence
    MFC MDI 窗口函数执行顺序
    decompose
    不新建一个文档
    code mistake
    ...
    paper
    stereo
  • 原文地址:https://www.cnblogs.com/brady-wang/p/14004597.html
Copyright © 2011-2022 走看看