zoukankan      html  css  js  c++  java
  • go 多协程爬取图片

    package main
    
    import (
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"golang.org/x/net/html"
    	"io/ioutil"
    	"net/http"
    	"strconv"
    	"strings"
    	"sync"
    	"time"
    )
    
    // url is the listing root; main appends the page file name to it.
    var url = "https://www.woyaogexing.com/shouji/"

    // referUrl is the Referer value passed to downloadUrl for page and image requests.
    var referUrl = "https://www.woyaogexing.com/shouji/"

    // referImg is the value crawl hands to downloadImgs as its refer argument.
    var referImg = "img2.woyaogexing.com"
    
    // downloadUrl performs a GET request for url, sending a desktop-browser
    // User-Agent and refer as the Referer header, and returns the response body.
    //
    // Every failure (malformed URL, transport error, non-200 status, read error)
    // is reported through handError and nil is returned, so callers must be
    // prepared for a nil slice.
    func downloadUrl(url string, refer string) []byte {
    	// A timeout keeps a stalled server from blocking the calling goroutine forever.
    	client := &http.Client{Timeout: 30 * time.Second}

    	req, err := http.NewRequest("GET", url, nil)
    	if err != nil {
    		// req is nil here; setting headers on it would panic.
    		handError(err)
    		return nil
    	}
    	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
    	req.Header.Set("Referer", refer)

    	response, err := client.Do(req)
    	if err != nil {
    		// response is nil here; touching response.Body would panic.
    		handError(err)
    		return nil
    	}
    	defer response.Body.Close()

    	// An error page must not be handed back as if it were the requested content.
    	if response.StatusCode != http.StatusOK {
    		handError(fmt.Errorf("GET %s: unexpected status %s", url, response.Status))
    		return nil
    	}

    	body, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		handError(err)
    		return nil
    	}
    	return body
    }
    
    // parseContent parses content as an HTML document and collects the "src"
    // value selected for every "//img/@src" match, in the order the query
    // returns them. Parse and query errors are passed to handError; execution
    // continues afterwards.
    func parseContent(content []byte) []string {
    	doc, parseErr := html.Parse(strings.NewReader(string(content)))
    	handError(parseErr)

    	matches, queryErr := htmlquery.QueryAll(doc, "//img/@src")
    	handError(queryErr)

    	var urls []string
    	for idx := range matches {
    		urls = append(urls, htmlquery.SelectAttr(matches[idx], "src"))
    	}
    	return urls
    }
    
    // downloadImgs fetches one protocol-relative image URL (only URLs starting
    // with "//img2" are handled) over http and stores it under ./imgs/, named
    // after the last path segment of the URL.
    //
    // wg.Done is called exactly once on every return path, including the
    // skipped-URL path, because the caller adds one count per URL before
    // starting the goroutines.
    //
    // The ./imgs directory must already exist; a failed write is reported
    // through handError.
    //
    // NOTE(review): refer is accepted but the request is still sent with the
    // package-level referUrl, exactly as before; confirm which Referer the
    // image host expects before switching to the parameter.
    func downloadImgs(url string, refer string, wg *sync.WaitGroup) {
    	// Registered before the prefix check so a skipped URL still releases its
    	// slot; otherwise the caller's Wait would block forever.
    	defer wg.Done()

    	if !strings.HasPrefix(url, "//img2") {
    		return
    	}
    	url = "http://" + strings.TrimPrefix(url, "//")
    	fmt.Println("下载图片", url)

    	content := downloadUrl(url, referUrl)
    	if len(content) == 0 {
    		// Nothing was fetched; do not leave an empty file behind.
    		return
    	}

    	fileName := url[strings.LastIndex(url, "/")+1:]
    	if fileName == "" {
    		// The URL ends in "/", so there is no file name to write to.
    		return
    	}

    	// 0644: image data does not need the executable bits.
    	if err := ioutil.WriteFile("./imgs/"+fileName, content, 0644); err != nil {
    		handError(err)
    		return
    	}
    	fmt.Printf("下载图片%s 成功\n", fileName)
    }
    
    // handError prints a non-nil err on standard output and lets the caller
    // carry on; a nil err is ignored.
    func handError(err error) {
    	if err == nil {
    		return
    	}
    	fmt.Println(err)
    }
    
    // main crawls the listing pages index_0.html through index_<totalPage>.html
    // concurrently, one goroutine per page, and returns once every crawl has
    // finished.
    func main() {
    	const totalPage = 10

    	started := time.Now()

    	var wg sync.WaitGroup
    	// NOTE(review): the inclusive range 0..totalPage is kept as it was;
    	// confirm that the site really serves index_0.html.
    	for j := 0; j <= totalPage; j++ {
    		pageUrl := url + "index_" + strconv.Itoa(j) + ".html"
    		wg.Add(1)
    		go func() {
    			// Done must run when the crawl ends, not right after the goroutine
    			// is launched; otherwise Wait returns immediately.
    			defer wg.Done()
    			crawl(pageUrl)
    		}()
    	}

    	// With the wait group counting correctly, no fixed sleep is needed to
    	// keep the process alive while downloads are in flight.
    	wg.Wait()
    	fmt.Printf("crawled %d pages in %s\n", totalPage+1, time.Since(started).Round(time.Millisecond))
    }
    
    // crawl downloads one listing page, prints the image sources found in it and
    // starts one downloadImgs goroutine per source, returning once the local
    // wait group has been released by all of them.
    func crawl(url string) {
    	page := downloadUrl(url, referUrl)
    	srcs := parseContent(page)
    	fmt.Println(srcs)

    	if len(srcs) == 0 {
    		return
    	}

    	var pending sync.WaitGroup
    	pending.Add(len(srcs))
    	for idx := range srcs {
    		go downloadImgs(srcs[idx], referImg, &pending)
    	}
    	pending.Wait()
    }
    

      

  • 相关阅读:
    Hive_MySQL安装
    Hive_安装部署
    Hive_和关系数据库比较
    Hive_架构原理
    Hive_优缺点
    Hive_什么是Hive
    java中的编码规范(1)
    SpringBoot_常用注解
    什么是WebMvcConfigurer
    什么是大数据倾斜
  • 原文地址:https://www.cnblogs.com/php-linux/p/13098687.html
Copyright © 2011-2022 走看看