zoukankan      html  css  js  c++  java
  • go爬取博客园

    package main
    
    import (
    	"bufio"
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"io/ioutil"
    	"net/http"
    	"os"
    	"strconv"
    	"time"
    )
    
    // getResponse issues a GET request for url, sending a browser-like
    // User-Agent and a fixed Referer header, and returns the response.
    //
    // It panics if the request cannot be constructed (for example when url is
    // malformed). If sending the request fails, the error is printed and nil is
    // returned, so callers must check for nil before using the result. On
    // success the caller is responsible for closing the response body.
    func getResponse(url string) *http.Response {
    	// Bound every request so a stalled connection cannot hang the crawl.
    	client := &http.Client{Timeout: 30 * time.Second}

    	request, err := http.NewRequest("GET", url, nil)
    	// The error must be checked before request is used: request is nil
    	// whenever NewRequest fails.
    	if err != nil {
    		panic(err)
    	}

    	// Add header options.
    	request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
    	request.Header.Add("Referer", "https://www.cnblogs.com/brady-wang/default.html?page=167")

    	resp, err := client.Do(request)
    	if err != nil {
    		// Report the failure instead of silently dropping it; nil tells
    		// the caller that there is no response to read.
    		fmt.Printf("request %s failed: %v\n", url, err)
    		return nil
    	}
    	return resp
    }
    
    func main() {
    
    		var url string = "https://www.cnblogs.com/brady-wang/default.html?page="
    		pages := 167
    		for i:=1;i<=pages;i++{
    			newUrl := url + strconv.Itoa(i)
    			fmt.Printf("crawl page %s
    ",newUrl)
    			urlList := getUrls(newUrl)
    			if len(urlList) > 0{
    				for _,detailUrl := range urlList{
    					//fmt.Printf("crawl 详情页面 %s
    ",detailUrl)
    					title := getDetail(detailUrl)
    					fmt.Printf("%s
    ",title)
    					time.Sleep(time.Microsecond*3000)
    				}
    			}
    		}
    
    }
    
    // getDetail fetches the page at url and returns the inner text of the first
    // node matching //*[@id='cb_post_title_url']/span.
    //
    // It returns "" when getResponse returns nil, when the body cannot be parsed
    // as HTML, or when no matching node exists.
    func getDetail(url string) string {
    	response := getResponse(url)
    	// getResponse can hand back nil; touching response.Body would then panic.
    	if response == nil {
    		return ""
    	}
    	defer response.Body.Close()

    	doc, err := htmlquery.Parse(response.Body)
    	if err != nil {
    		return ""
    	}

    	titleNodes := htmlquery.Find(doc, "//*[@id='cb_post_title_url']/span")
    	if len(titleNodes) == 0 {
    		return ""
    	}
    	return htmlquery.InnerText(titleNodes[0])
    }
    
    // getUrls fetches the list page at url and returns the href attribute of
    // every node matching
    // //*[@id='mainContent']//div[@class='postTitle']/a, in document order.
    //
    // It returns an empty, non-nil slice when getResponse returns nil, when the
    // body cannot be parsed as HTML, or when nothing matches.
    func getUrls(url string) []string {
    	urls := make([]string, 0)

    	response := getResponse(url)
    	// getResponse can hand back nil; touching response.Body would then panic.
    	if response == nil {
    		return urls
    	}
    	defer response.Body.Close()

    	doc, err := htmlquery.Parse(response.Body)
    	if err != nil {
    		// Without a parsed document there is nothing to query.
    		return urls
    	}

    	links := htmlquery.Find(doc, "//*[@id='mainContent']//div[@class='postTitle']/a")
    	for _, link := range links {
    		urls = append(urls, htmlquery.SelectAttr(link, "href"))
    	}
    	return urls
    }
    
    // downloadXiaoshuo fetches url, reads the whole response body, and passes it
    // to writeToFile as a string.
    //
    // Nothing is written when getResponse returns nil or when reading the body
    // fails; a read failure is printed.
    func downloadXiaoshuo(url string) {
    	response := getResponse(url)
    	// getResponse can hand back nil; touching response.Body would then panic.
    	if response == nil {
    		return
    	}
    	defer response.Body.Close()

    	body, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		fmt.Println(err)
    		// Do not persist a partially read body.
    		return
    	}
    	writeToFile(string(body))
    }
    
    // writeToFile writes str five times in a row to ./a.txt, creating the file
    // if it does not exist. Open, write and flush failures are printed; nothing
    // is returned to the caller.
    func writeToFile(str string) {
    	filePath := "./a.txt"
    	// NOTE(review): the file is opened without O_TRUNC or O_APPEND, so writing
    	// starts at offset 0 and any longer pre-existing content keeps its tail.
    	// Confirm whether truncating or appending is the intended behaviour.
    	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE, 0666)
    	if err != nil {
    		fmt.Printf("open file err=%v\n", err)
    		return
    	}
    	// Close the file handle as soon as the function returns.
    	defer file.Close()

    	// Write through a buffered writer.
    	writer := bufio.NewWriter(file)
    	for i := 0; i < 5; i++ {
    		if _, err := writer.WriteString(str); err != nil {
    			fmt.Printf("write file err=%v\n", err)
    			return
    		}
    	}

    	// Buffered data only reaches the file once it is flushed; without this
    	// call small payloads are never written at all.
    	if err := writer.Flush(); err != nil {
    		fmt.Printf("flush file err=%v\n", err)
    	}
    }
    
  • 相关阅读:
    Cuckoo Hashing
    Microsoft Windows的消息循环
    Simplex, Full-Duplex and Half-Duplex Operation
    Linux 技巧:让进程在后台运行更可靠的几种方法
    https://learnku.com/docs/go-blog/qihoo/6532 。 heap size went up to 69G, with maximum garbage collection (GC)
    Go GC: Latency Problem Solved
    Sapphire: Copying GC Without Stopping the World
    The Go Blog Getting to Go: The Journey of Go's Garbage Collector
    xml CDATA
    Josephus circle algorithm
  • 原文地址:https://www.cnblogs.com/brady-wang/p/14308624.html
Copyright © 2011-2022 走看看