zoukankan      html  css  js  c++  java
  • go语言之进阶篇爬百度贴吧单线程版本

    一、爬什么?

    1、明确目标 : 知道你准备在哪个范围或者网站去搜索

    2、爬: 将所有的网站的内容全部爬下来

    3、取:去掉对我们没用处的数据

    4、处理数据:按照我们想要的方式存储或使用

    二、百度贴吧小爬虫

    需求:爬取百度贴吧的列表页;每翻一页,URL 中的 pn 参数增加 50

    https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=0

    https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=50

    示例: 单线程版本

    package main
    
    import (
    	"fmt"
    	"io"
    	"net/http"
    	"os"
    	"strconv"
    )
    
    // HttpGet issues an HTTP GET request for url and returns the complete
    // response body as a string.
    //
    // A non-nil error is returned when the request itself fails or when reading
    // the body fails part-way through; in both cases result is empty rather than
    // a silently truncated page. The HTTP status code is not inspected, so the
    // body of a non-2xx response is returned like any other.
    func HttpGet(url string) (result string, err error) {
    	resp, err := http.Get(url)
    	if err != nil {
    		return "", err
    	}
    	defer resp.Body.Close()

    	// Accumulate raw bytes and convert to a string once at the end; appending
    	// to a string on every chunk would re-copy everything read so far.
    	var body []byte
    	buf := make([]byte, 4*1024)
    	for {
    		n, readErr := resp.Body.Read(buf)
    		// A Read may return data together with an error, so keep the bytes
    		// before looking at the error.
    		body = append(body, buf[:n]...)
    		if readErr == io.EOF {
    			break // normal end of the body
    		}
    		if readErr != nil {
    			return "", readErr
    		}
    	}

    	return string(body), nil
    }
    
    func DoWork(start, end int) {
    	fmt.Printf("正在爬取 %d 到 %d 的页面
    ", start, end)
    
    	//明确目标 (要知道你准备在哪个范围或者网站去搜索)
    	for i := start; i <= end; i++ {
    		url := "http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
    		fmt.Println("url = ", url)
    
    		//2) 爬 (将所有的网站的内容全部爬下来)
    		result, err := HttpGet(url)
    		if err != nil {
    			fmt.Println("HttpGet err = ", err)
    			continue
    		}
    
    		//把内容写入到文件
    		fileName := strconv.Itoa(i) + ".html"
    		f, err1 := os.Create(fileName)
    		if err1 != nil {
    			fmt.Println("os.Create err1 = ", err1)
    			continue
    		}
    
    		f.WriteString(result) //写内容
    
    		f.Close() //关闭文件
    	}
    
    }
    
    // main asks for the first and last page numbers on standard input and then
    // crawls that inclusive range with DoWork.
    //
    // Input that cannot be parsed as an integer, or a range that violates the
    // prompts (start >= 1, end >= start), is reported and the program exits
    // without crawling anything.
    func main() {
    	var start, end int

    	fmt.Printf("请输入起始页( >= 1) :")
    	if _, err := fmt.Scan(&start); err != nil {
    		fmt.Println("fmt.Scan err = ", err)
    		return
    	}

    	fmt.Printf("请输入终止页( >= 起始页) :")
    	if _, err := fmt.Scan(&end); err != nil {
    		fmt.Println("fmt.Scan err = ", err)
    		return
    	}

    	// A start below 1 would produce a negative pn offset, and end < start
    	// would make the crawl loop do nothing; reject both up front.
    	if start < 1 || end < start {
    		fmt.Printf("invalid page range: start = %d, end = %d\n", start, end)
    		return
    	}

    	DoWork(start, end)
    }
    

    执行结果:

    请输入起始页( >= 1) :1   //输入起始页
    请输入终止页( >= 起始页) :10  //输入终止页
    正在爬取 1 到 10 的页面
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=0
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=50
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=100
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=150
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=200
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=250
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=300
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=350
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=400
    url =  http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=450
    

      

  • 相关阅读:
    高精度计算
    高精度除以低精度
    P1258 小车问题
    POJ 2352 stars (树状数组入门经典!!!)
    HDU 3635 Dragon Balls(超级经典的带权并查集!!!新手入门)
    HDU 3938 Portal (离线并查集,此题思路很强!!!,得到所谓的距离很巧妙)
    POJ 1703 Find them, Catch them(确定元素归属集合的并查集)
    HDU Virtual Friends(超级经典的带权并查集)
    HDU 3047 Zjnu Stadium(带权并查集,难想到)
    HDU 3038 How Many Answers Are Wrong(带权并查集,真的很难想到是个并查集!!!)
  • 原文地址:https://www.cnblogs.com/nulige/p/10323630.html
Copyright © 2011-2022 走看看