package main
import (
"fmt"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"io/ioutil"
"net/http"
"strconv"
"strings"
"sync"
"time"
)
var (
	// url is the base address of the phone-wallpaper listing pages;
	// main appends "index_<n>.html" page suffixes to it.
	url = "https://www.woyaogexing.com/shouji/"
	// referUrl is sent as the Referer header on every page/image request
	// (presumably the site rejects requests without a matching referer —
	// TODO confirm against the server's hotlink protection).
	referUrl = "https://www.woyaogexing.com/shouji/"
	// referImg is the image host prefix passed to downloadImgs as its
	// refer argument. NOTE(review): downloadImgs currently ignores this
	// and sends referUrl instead — see that function.
	referImg = "img2.woyaogexing.com"
)
func downloadUrl(url string, refer string) []byte {
client := &http.Client{}
req, e := http.NewRequest("GET", url, nil)
handError(e)
req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
req.Header.Add("Referer", refer)
response, err := client.Do(req)
handError(err)
defer response.Body.Close()
byteContent, e := ioutil.ReadAll(response.Body)
handError(e)
return byteContent
}
// parseContent parses an HTML page and returns the src attribute of every
// <img> element found in it. A parse or XPath failure is printed and nil
// is returned; previously a failed parse left the root node nil and the
// subsequent XPath query panicked.
func parseContent(content []byte) []string {
	reader := strings.NewReader(string(content))
	root, err := html.Parse(reader)
	if err != nil {
		handError(err)
		return nil
	}
	// //img/@src selects the src attribute nodes; SelectAttr on each
	// result yields the attribute value (documented htmlquery pattern).
	nodes, err := htmlquery.QueryAll(root, "//img/@src")
	if err != nil {
		handError(err)
		return nil
	}
	urls := make([]string, 0, len(nodes))
	for _, n := range nodes {
		urls = append(urls, htmlquery.SelectAttr(n, "src"))
	}
	return urls
}
// downloadImgs downloads a single image URL into ./imgs/ and signals wg
// when done. URLs that do not start with the protocol-relative "//img2"
// host prefix are skipped.
//
// Fixes vs. the original:
//   - wg.Done() is now deferred BEFORE the skip check. Previously a
//     skipped URL returned without calling Done, so crawl's wg.Wait()
//     deadlocked whenever a page contained any non-matching image.
//   - The result message was inverted: "成功" (success) was printed when
//     WriteFile returned a non-nil error. Success and failure are now
//     reported on the correct branches.
//
// NOTE(review): the refer parameter is unused — downloads send referUrl,
// matching the original behavior; callers pass referImg (a bare host, not
// a URL), so switching to it could break the site's referer check.
func downloadImgs(url string, refer string, wg *sync.WaitGroup) {
	defer wg.Done()
	if !strings.HasPrefix(url, "//img2") {
		return
	}
	// Protocol-relative //host/path -> absolute http URL.
	url = "http://" + url[2:]
	fmt.Println("下载图片", url)
	content := downloadUrl(url, referUrl)
	parts := strings.Split(url, "/")
	fileName := parts[len(parts)-1]
	if err := ioutil.WriteFile("./imgs/"+fileName, content, 0777); err != nil {
		fmt.Printf("下载图片%s 失败: %v\n", fileName, err)
	} else {
		fmt.Printf("下载图片%s 成功", fileName)
	}
}
// handError prints err to stdout when it is non-nil. It never aborts the
// program; callers are responsible for deciding how to proceed after a
// failure.
func handError(err error) {
	if err == nil {
		return
	}
	fmt.Println(err)
}
// main crawls listing pages index_0.html .. index_10.html concurrently
// and waits for every crawl to finish before exiting.
//
// Fix: the original called wg.Done() synchronously right after launching
// each goroutine, so wg.Wait() returned immediately and completion was
// faked with a 100-second time.Sleep. Done is now deferred inside the
// goroutine, the Wait is real, and the sleep is replaced by an elapsed-
// time report.
func main() {
	start := time.Now()
	var wg sync.WaitGroup
	const totalPage = 10
	for j := 0; j <= totalPage; j++ {
		wg.Add(1)
		pageUrl := url + "index_" + strconv.Itoa(j) + ".html"
		go func(p string) {
			defer wg.Done()
			crawl(p)
		}(pageUrl)
	}
	wg.Wait()
	fmt.Println("全部页面抓取完成, 耗时", time.Since(start))
}
// crawl fetches one listing page, extracts every <img> src from it, and
// downloads each candidate image on its own goroutine, waiting for all
// of them to complete before returning.
func crawl(url string) {
	page := downloadUrl(url, referUrl)
	imgURLs := parseContent(page)
	fmt.Println(imgURLs)
	if len(imgURLs) == 0 {
		return
	}
	var wg sync.WaitGroup
	wg.Add(len(imgURLs))
	for _, imgURL := range imgURLs {
		go downloadImgs(imgURL, referImg, &wg)
	}
	wg.Wait()
}