zoukankan      html  css  js  c++  java
  • golang抓取soyoung新氧案例图

    时隔多年,感慨良多。废话不多说,直接上代码:

    package main
    
    import (
    	"crypto/md5"
    	"encoding/hex"
    	"encoding/json"
    	"io/ioutil"
    	"log"
    	"net/http"
    	"os"
    	"runtime"
    	"strings"
    
    	"github.com/PuerkitoBio/goquery"
    	"github.com/axgle/mahonia"
    )
    
    // ConvertToString converts src between character sets using mahonia.
    //
    // src is first run through the decoder registered for srcCode, and the result
    // is then passed through the decoder registered for tagCode; the bytes
    // produced by that second pass are returned as a string.
    //
    // If either charset name is unknown to mahonia, src is returned unchanged
    // instead of panicking on a nil decoder.
    //
    // NOTE(review): the target side uses a Decoder rather than an Encoder, so for
    // targets other than UTF-8 the output is presumably not encoded in tagCode.
    // The only caller in this file targets "utf-8"; confirm before reusing this
    // with another target charset.
    func ConvertToString(src string, srcCode string, tagCode string) string {
    	srcCoder := mahonia.NewDecoder(srcCode)
    	tagCoder := mahonia.NewDecoder(tagCode)
    	// NewDecoder yields nil for an unregistered charset name; calling a method
    	// on it would dereference a nil function value.
    	if srcCoder == nil || tagCoder == nil {
    		return src
    	}

    	srcResult := srcCoder.ConvertString(src)
    	// The byte count and error from Translate are deliberately dropped, as in
    	// the original; with eof=true the whole input is consumed in one call.
    	_, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
    	return string(cdata)
    }
    
    // GbkToUtf8 runs src through ConvertToString with "gbk" as the source charset
    // and "utf-8" as the target charset, and returns the converted string.
    func GbkToUtf8(src string) string {
    	const (
    		sourceCharset = "gbk"
    		targetCharset = "utf-8"
    	)
    	converted := ConvertToString(src, sourceCharset, targetCharset)
    	return converted
    }
    
    // downloadImg fetches img_url with a GET request and stores the response body
    // in the file named filename.
    //
    // Referer is sent as the Referer header together with a fixed browser
    // User-Agent. The function never returns an error: every failure (invalid
    // URL, transport error, non-200 status, read or write error) is logged and
    // the download is abandoned, so one bad image does not stop the caller.
    func downloadImg(img_url string, filename string, Referer string) {
    	req, err := http.NewRequest("GET", img_url, nil)
    	if err != nil {
    		// Without this check a malformed URL leaves req nil and the header
    		// calls below panic.
    		log.Println("build request failed:", img_url, err)
    		return
    	}
    	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36")
    	req.Header.Add("Referer", Referer)

    	client := &http.Client{}
    	response, err := client.Do(req)
    	if err != nil {
    		log.Println("get img_url failed:", err)
    		return
    	}
    	defer response.Body.Close()

    	// Do not save an error page under an image file name.
    	if response.StatusCode != http.StatusOK {
    		log.Println("get img_url failed:", img_url, response.Status)
    		return
    	}

    	data, err := ioutil.ReadAll(response.Body)
    	if err != nil {
    		log.Println("read data failed:", img_url, err)
    		return
    	}

    	// WriteFile creates or truncates the file and reports write and close
    	// errors, which the previous Create+Write sequence ignored.
    	if err := ioutil.WriteFile(filename, data, 0666); err != nil {
    		log.Println("write file failed:", filename, err)
    	}
    }
    
    // GetJokes downloads the case photos of one soyoung diary into ./<id>/.
    //
    // url is the address of the diary's first page and id is used as the name of
    // the output directory. The href of every ".big-photo" element on page one is
    // saved under ./<id>/before/. For every ".diary-item" on page one and on
    // url+"/p2/", the images listed in ".photo-list li img" (attribute
    // "data-img") are saved under ./<id>/<text of the item's ".day" element>/.
    // Each file is named after the hex MD5 of its URL plus ".jpg".
    //
    // The returned map holds "ID" (id passed through GbkToUtf8) and "picUrl".
    // "picUrl" is "success" unless page one could not be fetched or parsed, in
    // which case it is "fail". A failure on page two is logged and otherwise
    // ignored, since page one has already been processed at that point.
    //
    // Failures are logged rather than passed to log.Fatal so that a single bad
    // request cannot terminate the process serving the HTTP handler.
    //
    // NOTE(review): the function name looks inherited from an unrelated scraper;
    // it is kept unchanged so existing callers continue to compile.
    func GetJokes(url string, id string) map[string]string {
    	info := make(map[string]string)
    	info["ID"] = GbkToUtf8(id)
    	info["picUrl"] = "success"

    	client := &http.Client{}

    	// page one
    	doc := fetchDiaryPage(client, url, "https://www.soyoung.com/")
    	if doc == nil {
    		info["picUrl"] = "fail"
    		return info
    	}

    	// Collect the "before" photo links, skipping elements without an href.
    	var beforeImgs []string
    	doc.Find(".big-photo").Each(func(i int, s *goquery.Selection) {
    		imgUrl, ok := s.Attr("href")
    		if !ok || imgUrl == "" {
    			return
    		}
    		beforeImgs = append(beforeImgs, imgUrl)
    	})

    	beforeDir := "./" + id + "/before"
    	if err := os.MkdirAll(beforeDir, os.ModePerm); err != nil {
    		log.Println(err)
    	} else {
    		for _, v := range beforeImgs {
    			downloadImg(v, beforeDir+"/"+imageFileName(v), url)
    		}
    	}

    	saveDiaryImages(doc, id, url)

    	// page two
    	doc2 := fetchDiaryPage(client, url+"/p2/", url)
    	if doc2 == nil {
    		return info
    	}
    	saveDiaryImages(doc2, id, url)

    	return info
    }

    // fetchDiaryPage GETs pageURL with the scraper's fixed headers and parses the
    // body as HTML. It logs the reason and returns nil when the request cannot be
    // built or sent, the status is not 200, or the document cannot be parsed.
    func fetchDiaryPage(client *http.Client, pageURL string, referer string) *goquery.Document {
    	const userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    	// Hard-coded cookie header, including a PHPSESSID.
    	// NOTE(review): presumably captured from a browser session and may have
    	// expired; confirm whether the site still requires it.
    	const cookie = "__order_time__=undefined; msg_time=undefined; back_order_time=undefined; complain_time=undefined; __usersign__=1570614910876417305; _ga=GA1.2.2061581476.1570614904; _gid=GA1.2.1666843180.1570614904; PHPSESSID=5001a7796cc83a8255b33284a3a30dd7; cityId=1; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1570614904,1570693381; __p_t__=15706935958294; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bs%3A3%3A%22207%22%3Bs%3A8%3A%22cityName%22%3Bs%3A9%3A%22%E6%B3%89%E5%B7%9E%E5%B8%82%22%3Bs%3A8%3A%22cityCode%22%3Bs%3A3%3A%22134%22%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1570694344"

    	req, err := http.NewRequest("GET", pageURL, nil)
    	if err != nil {
    		log.Println("build request failed:", pageURL, err)
    		return nil
    	}
    	req.Header.Add("User-Agent", userAgent)
    	req.Header.Add("Referer", referer)
    	req.Header.Add("Cookie", cookie)

    	res, err := client.Do(req)
    	if err != nil {
    		log.Println("get page failed:", pageURL, err)
    		return nil
    	}
    	defer res.Body.Close()

    	if res.StatusCode != http.StatusOK {
    		log.Println("get page failed:", pageURL, res.Status)
    		return nil
    	}

    	doc, err := goquery.NewDocumentFromReader(res.Body)
    	if err != nil {
    		log.Println("parse page failed:", pageURL, err)
    		return nil
    	}
    	return doc
    }

    // saveDiaryImages downloads the photos of every ".diary-item" in doc into
    // ./<id>/<day title>/, sending referer with each image request.
    func saveDiaryImages(doc *goquery.Document, id string, referer string) {
    	doc.Find(".diary-item").Each(func(i int, s *goquery.Selection) {
    		dir := "./" + id + "/" + diaryDirName(s.Find(".day").Text())
    		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
    			log.Println(err)
    			return
    		}

    		s.Find(".photo-list li img").Each(func(k int, s2 *goquery.Selection) {
    			imgUrl, ok := s2.Attr("data-img")
    			if !ok || imgUrl == "" {
    				return
    			}
    			// Drop the "_301_301" suffix and the "face/" path segment.
    			// NOTE(review): presumably this maps a thumbnail URL to the
    			// full-size image; verify against the site.
    			imgUrl = strings.Replace(imgUrl, "_301_301", "", -1)
    			imgUrl = strings.Replace(imgUrl, "face/", "", -1)
    			downloadImg(imgUrl, dir+"/"+imageFileName(imgUrl), referer)
    		})
    	})
    }

    // imageFileName returns the hex MD5 of imgUrl followed by ".jpg".
    func imageFileName(imgUrl string) string {
    	sum := md5.Sum([]byte(imgUrl))
    	return hex.EncodeToString(sum[:]) + ".jpg"
    }

    // diaryDirName turns scraped title text into a single path segment: it trims
    // surrounding whitespace, replaces path separators, and neutralises "." and
    // ".." so page content cannot redirect writes outside the diary directory.
    func diaryDirName(title string) string {
    	name := strings.TrimSpace(title)
    	name = strings.Replace(name, "/", "_", -1)
    	name = strings.Replace(name, "\\", "_", -1)
    	if name == "." || name == ".." {
    		return "_"
    	}
    	return name
    }
    
    // main registers indexHandler for every path and serves HTTP on port 1024.
    //
    // Example request: http://127.0.0.1:1024/?id=dpg8426968
    func main() {
    	// Explicitly use every CPU; kept from the original program.
    	runtime.GOMAXPROCS(runtime.NumCPU())

    	http.HandleFunc("/", indexHandler)

    	// ListenAndServe only returns on failure (for example when the port is
    	// already in use); report it instead of exiting silently.
    	if err := http.ListenAndServe(":1024", nil); err != nil {
    		log.Fatal(err)
    	}
    }
    
    // indexHandler scrapes the soyoung diary named by the "id" form value and
    // replies with the JSON encoding of the map returned by GetJokes.
    //
    // The id is concatenated into both the remote URL and a local directory
    // path, so it is validated first: requests with a missing or malformed id
    // (including browser requests such as /favicon.ico that carry no id) get
    // 400 Bad Request and nothing is fetched or written.
    func indexHandler(w http.ResponseWriter, r *http.Request) {
    	if err := r.ParseForm(); err != nil {
    		http.Error(w, "bad request", http.StatusBadRequest)
    		return
    	}

    	id := r.FormValue("id")
    	if !validDiaryID(id) {
    		http.Error(w, "invalid id", http.StatusBadRequest)
    		return
    	}

    	info := GetJokes("https://www.soyoung.com/"+id, id)

    	s, err := json.Marshal(info)
    	if err != nil {
    		log.Println("marshal response failed:", err)
    		http.Error(w, "internal error", http.StatusInternalServerError)
    		return
    	}

    	w.Header().Set("Content-Type", "application/json; charset=utf-8")
    	if _, err := w.Write(s); err != nil {
    		log.Println("write response failed:", err)
    	}
    }

    // validDiaryID reports whether id is non-empty and consists only of ASCII
    // letters, digits, '-' and '_', which rules out path separators and "..".
    func validDiaryID(id string) bool {
    	if id == "" {
    		return false
    	}
    	for _, c := range id {
    		switch {
    		case c >= 'a' && c <= 'z':
    		case c >= 'A' && c <= 'Z':
    		case c >= '0' && c <= '9':
    		case c == '-' || c == '_':
    		default:
    			return false
    		}
    	}
    	return true
    }
    

      

  • 相关阅读:
    re模块 与正则表达式之间的关系 一.....
    计算机硬件组成
    随便
    linux crontab定时
    linux shell命令记录
    tomcat意外退出 A valid shutdown command was received via the shutdown port. Stopping the Server instance
    mongodb设置用户
    mongodb的mongo.conf文件 启动参数
    mysql设置不区分表名大小写
    mongodb报Write failed with error code 17280 and error message 'WiredTigerIndex::insert: key too large
  • 原文地址:https://www.cnblogs.com/hcjs/p/11653051.html
Copyright © 2011-2022 走看看