zoukankan      html  css  js  c++  java
  • go爬虫

    package main
    
    import (
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"io/ioutil"
    	"net/http"
    	"os"
    	"regexp"
    	"strings"
    	"sync"
    	"time"
    )
    var wg sync.WaitGroup
    func main() {
    
    	var url string = "https://haomooc.com/xiaoxue-read-2991.html"
    
    	resp, _ := http.Get(url)
    
    	defer resp.Body.Close()
    
    	doc, _ := htmlquery.Parse(resp.Body)
    	list := htmlquery.Find(doc, "//div[@class='dxs-l-b']//a")
    	for _, li := range list {
    		href := htmlquery.SelectAttr(li, "href")
    		strings.Replace(href, " ", "", -1)
    		title := htmlquery.SelectAttr(li,"title")
    		strings.Replace(title, " ", "", -1)
    		fmt.Printf("%s
    ", title)
    		fmt.Printf("%s
    ", href)
    		video := getVideo(href)
    		wg.Add(1)
    		//saveVideo(title,video)
    		fmt.Printf("%s
    ", video)
    	}
    
    	wg.Wait()
    }
    
    // videoFetchDelay is how long getVideo pauses before each request so the
    // crawler does not hit the site in a tight loop. It is a variable so it can
    // be tuned (or set to zero in tests).
    var videoFetchDelay = time.Second

    // videoURLPattern matches one .mp4 link hosted on video.haomooc.com. The
    // dots are escaped and the path is matched lazily, stopping at quotes,
    // angle brackets and line breaks, so two links on the same line are
    // reported as two matches rather than one span covering both.
    var videoURLPattern = regexp.MustCompile(`https://video\.haomooc\.com/[^"'<>\r\n]*?\.mp4`)

    // getVideo fetches the page at url and returns the last video.haomooc.com
    // .mp4 link found in its body, with any spaces removed.
    //
    // It returns "" when the page cannot be fetched or read, or when it contains
    // no matching link; fetch and read failures are reported on stderr.
    func getVideo(url string) string {
    	time.Sleep(videoFetchDelay)

    	resp, err := http.Get(url)
    	if err != nil {
    		fmt.Fprintf(os.Stderr, "getVideo %s: %v\n", url, err)
    		return ""
    	}
    	defer resp.Body.Close()

    	body, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		fmt.Fprintf(os.Stderr, "getVideo %s: reading body: %v\n", url, err)
    		return ""
    	}

    	matches := videoURLPattern.FindAllString(string(body), -1)
    	if len(matches) == 0 {
    		return ""
    	}

    	// The last link on the page wins, as before. strings.Replace returns a
    	// new string, so its result must be the value returned.
    	return strings.Replace(matches[len(matches)-1], " ", "", -1)
    }
    // PathExists reports whether path can be stat'ed.
    //
    // It returns (true, nil) when os.Stat succeeds, (false, nil) when the error
    // is a "does not exist" error, and (false, err) for any other stat failure.
    func PathExists(path string) (bool, error) {
    	switch _, statErr := os.Stat(path); {
    	case statErr == nil:
    		return true, nil
    	case os.IsNotExist(statErr):
    		return false, nil
    	default:
    		return false, statErr
    	}
    }
    
    func saveVideo(title string ,url string)  {
    
    	fmt.Printf(title,url)
    	path := "/www/shell/video/"+title+".mp4"
    	b, err := PathExists(path)
    	if err != nil {
    		fmt.Printf("PathExists(%s),err(%v)
    ", path, err)
    	}
    	if b {
    		fmt.Printf("path %s 存在
    ", path)
    	} else{
    		fmt.Println("save video "+title )
    		fmt.Printf("%s",url)
    		// Get the data
    		resp, err := http.Get(url)
    		if err != nil {
    			panic(err)
    		}
    		defer resp.Body.Close()
    
    		data, err := ioutil.ReadAll(resp.Body)
    		if err != nil {
    			panic(err)
    		}
    		ioutil.WriteFile(path, data, 0644)
    		defer wg.Done()
    	}
    
    
    }
    
  • 相关阅读:
    mongoDB看这篇就够了
    放不下
    jmeter连接不上MySQL数据库的原因以及解决方法
    SecureCRT自动断开连接的解决方法
    Linux及Windows查看占用端口的进程
    网络基础知识
    selenium中driver.close()和driver.quit()的不同点
    day2_窗口句柄切换
    day6_异常捕捉
    day6_logging模块
  • 原文地址:https://www.cnblogs.com/brady-wang/p/14054939.html
Copyright © 2011-2022 走看看