zoukankan      html  css  js  c++  java
  • go爬取博客园

    package main
    
    import (
    	"bufio"
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"io/ioutil"
    	"net/http"
    	"os"
    	"strconv"
    	"time"
    )
    
    // httpClient is shared by every request so the underlying connections can be
    // reused; the timeout keeps a stalled server from blocking the crawler forever.
    var httpClient = &http.Client{Timeout: 30 * time.Second}

    // getResponse sends a GET request to url with browser-like User-Agent and
    // Referer headers and returns the response.
    //
    // It returns nil when the request cannot be built (for example a malformed
    // URL) or when the round trip fails; the error is printed to stdout. On
    // success the caller owns the response and must close its Body.
    func getResponse(url string) *http.Response {
    	request, err := http.NewRequest("GET", url, nil)
    	// The error must be checked before request is touched: on failure
    	// request is nil and adding headers would dereference a nil pointer.
    	if err != nil {
    		fmt.Printf("build request %s err=%v\n", url, err)
    		return nil
    	}

    	// Add the header options.
    	request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
    	request.Header.Add("Referer", "https://www.cnblogs.com/brady-wang/default.html?page=167")

    	resp, err := httpClient.Do(request)
    	if err != nil {
    		fmt.Printf("request %s err=%v\n", url, err)
    		return nil
    	}
    	return resp
    }
    
    func main() {
    
    		var url string = "https://www.cnblogs.com/brady-wang/default.html?page="
    		pages := 167
    		for i:=1;i<=pages;i++{
    			newUrl := url + strconv.Itoa(i)
    			fmt.Printf("crawl page %s
    ",newUrl)
    			urlList := getUrls(newUrl)
    			if len(urlList) > 0{
    				for _,detailUrl := range urlList{
    					//fmt.Printf("crawl 详情页面 %s
    ",detailUrl)
    					title := getDetail(detailUrl)
    					fmt.Printf("%s
    ",title)
    					time.Sleep(time.Microsecond*3000)
    				}
    			}
    		}
    
    }
    
    // getDetail fetches the post page at url and returns the inner text of the
    // first node matching //*[@id='cb_post_title_url']/span.
    //
    // It returns "" when the request fails, the HTML cannot be parsed, or no
    // such node exists.
    func getDetail(url string) string {
    	response := getResponse(url)
    	// A failed request yields no response; there is no body to read or close.
    	if response == nil {
    		return ""
    	}
    	defer response.Body.Close()

    	doc, err := htmlquery.Parse(response.Body)
    	if err != nil {
    		return ""
    	}

    	titleNodes := htmlquery.Find(doc, "//*[@id='cb_post_title_url']/span")
    	if len(titleNodes) == 0 {
    		return ""
    	}
    	return htmlquery.InnerText(titleNodes[0])
    }
    
    // getUrls fetches the index page at url and returns the href attribute of
    // every anchor matching
    // //*[@id='mainContent']//div[@class='postTitle']/a, in document order.
    //
    // It returns an empty result when the request fails or the HTML cannot be
    // parsed.
    func getUrls(url string) []string {
    	response := getResponse(url)
    	// A failed request yields no response; there is no body to read or close.
    	if response == nil {
    		return nil
    	}
    	defer response.Body.Close()

    	doc, err := htmlquery.Parse(response.Body)
    	if err != nil {
    		// Without a parsed document there is nothing to query.
    		return nil
    	}

    	list := htmlquery.Find(doc, "//*[@id='mainContent']//div[@class='postTitle']/a")
    	urls := make([]string, 0, len(list))
    	for _, item := range list {
    		urls = append(urls, htmlquery.SelectAttr(item, "href"))
    	}
    	return urls
    }
    
    // downloadXiaoshuo fetches url and hands the whole response body to
    // writeToFile as a string.
    //
    // Nothing is written when the request fails or the body cannot be read in
    // full; a read error is printed to stdout.
    func downloadXiaoshuo(url string) {
    	response := getResponse(url)
    	// A failed request yields no response; there is no body to read or close.
    	if response == nil {
    		return
    	}
    	defer response.Body.Close()

    	body, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		fmt.Println(err)
    		// Do not persist a truncated body.
    		return
    	}
    	writeToFile(string(body))
    }
    
    // writeToFile writes str five times in a row to ./a.txt, creating the file
    // if it does not exist. Open, write and flush errors are printed to stdout.
    //
    // NOTE(review): the file is opened without O_TRUNC or O_APPEND, so writing
    // starts at offset 0 and any longer previous content keeps its tail —
    // confirm whether truncation or appending is intended.
    func writeToFile(str string) {
    	filePath := "./a.txt"
    	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE, 0666)
    	if err != nil {
    		fmt.Printf("open file err=%v\n", err)
    		return
    	}
    	// Close the file handle as soon as the function is done.
    	defer file.Close()

    	// Write through a buffered *Writer.
    	writer := bufio.NewWriter(file)
    	for i := 0; i < 5; i++ {
    		if _, err := writer.WriteString(str); err != nil {
    			fmt.Printf("write file err=%v\n", err)
    			return
    		}
    	}
    	// Buffered data only reaches the file on Flush; without it the content
    	// still held in the buffer is dropped when the function returns.
    	if err := writer.Flush(); err != nil {
    		fmt.Printf("flush file err=%v\n", err)
    	}
    }
    
  • 相关阅读:
    缓存---缓存位置
    缓存---LRU算法实现
    缓存---缓存特征
    Redis---分片
    Redis---复制
    Redis---事件
    Redis---事务
    Redis---持久化
    javaSript 处理电脑和浏览器pc端缩放对页面的影响
    css设置不可复制
  • 原文地址:https://www.cnblogs.com/brady-wang/p/14308624.html
Copyright © 2011-2022 走看看