zoukankan      html  css  js  c++  java
  • 如何用golang搜索抓取淘宝商品

    package main
    
    
    import (
        "fmt"
        "log"
        "os"
        "strings"
        "sync"
    
    
        "./php"
        "github.com/tealeg/xlsx"
    )
    
    
    var wg sync.WaitGroup // wait group: main calls Add/Wait on it, and each Spy goroutine calls Done
    func main() {
        fileName := "xxx_debug.log"
        logFile, err := os.Create(fileName)
        defer logFile.Close()
        log.SetOutput(logFile)
        arg_num := len(os.Args)
        fmt.Printf("the num of input is %d
    ", arg_num)
    
    
        if arg_num == 1 || !strings.Contains(os.Args[1], ".xlsx") {
            fmt.Println("请输入****.xlsx文件作为参数")
            return
        }
    
    
        fmt.Printf("they are :
    ")
        for i := 0; i < arg_num; i++ {
            fmt.Println(os.Args[i])
        }
    
    
        var (
            excel_file_path string                         = os.Args[1]
            file_result     map[int]map[int]map[int]string = make(map[int]map[int]map[int]string)
            sheet_result    map[int]map[int]string         = make(map[int]map[int]string)
        )
        //打开一个excel文件资源
        f, err := xlsx.OpenFile(excel_file_path)
        if err != nil {
            log.Println(err.Error())
        }
    
    
        //循环文件中所有工作表
        for sheet_key, sheet := range f.Sheets {
            //循环对应工作表中行数
            for key, row := range sheet.Rows {
                row_result := make(map[int]string)
                //循环工作表行数的每一列
                for k, cell := range row.Cells {
                    row_result[k] = cell.Value
                }
                //如果为空不添加对应值到 数组
                if !php.Empty(row_result) {
                    sheet_result[key] = row_result
                }
            }
            //如果为空不添加对应值到 数组
            if !php.Empty(sheet_result) {
                file_result[sheet_key] = sheet_result
            }
    
    
        }
        //输出表格的结果
        for _, sheet := range file_result {
            for k, _ := range sheet {
                if k != 0 || !strings.Contains(sheet[k][1], "商品名称") {
                    log.Printf("%d=%v
    ", k, sheet[k][1])
                    wg.Add(1) //为同步等待组增加一个成员
                    go Spy(sheet[k][1])
                }
            }
        }
        wg.Wait() //阻塞等待所有组内成员都执行完毕退栈
        fmt.Println("WE DONE!!!")
    }




    func Spy(urls string) {
        defer func() {
            wg.Done()
    
    
            if r := recover(); r != nil {
                log.Println("[E]", r)
            }
        }()
        urls = url.QueryEscape(urls)
        urlpath := tburl + urls + tburlpara
    
    
        log.Println(urlpath)
        req, err := http.NewRequest("GET", urlpath, nil)
        if err != nil {
            log.Printf("Get请求%s返回错误:%s", urlpath, err)
            return
        }
        req.Header.Set("User-Agent", GetRandomUserAgent())
        client := http.DefaultClient
        res, e := client.Do(req)
        if e != nil {
            log.Printf("Get请求%s返回错误:%s", urlpath, e)
            return
        }
    
    
        if res.StatusCode == 200 {
            body := res.Body
            defer body.Close()
            bodyByte, _ := ioutil.ReadAll(body)
            resStr := string(bodyByte)
    
    
            ajson := atagRegExp.FindAllString(resStr, -1)
            nlen := len(ajson[0])
            if nlen > 16 {
                jsons := ajson[0][16 : len(ajson[0])-2]
                var v interface{}
                json.Unmarshal([]byte(jsons), &v)
                i := 0
                minprice := 9999999.00
                words, _ := dproxy.New(v).P("/mods/itemlist/data").M("query").String()
                m := make(map[string][]string)
                for {
                    set := dproxy.New(v).P("/mods/itemlist/data/auctions").A(i)
    
    
                    var u = make([]string, 0)
                    sales, err := set.M("view_sales").String()
                    if err != nil {
                        log.Printf("/mods/itemlist/data/auctions path error %v 
    ", err.Error())
                        break
                    }
                    sales = strings.Replace(sales, "人付款", "", 1)
                    price, err := set.M("view_price").String()
                    title, err := set.M("raw_title").String()
                    url, err := set.M("detail_url").String()
    
    
                    inprice, err := strconv.ParseFloat(price, 32)
                    if err != nil {
                        log.Println("转换有错")
                        panic(fmt.Sprintf("%v 转换有错", price))
    
    
                    }
                    insales, err := strconv.Atoi(sales)
                    if err != nil {
                        log.Println("转换有错")
                        panic(fmt.Sprintf("%v 转换有错", price))
    
    
                    }
                    if minprice > inprice && inprice > 1 && insales >= 1 {
                        minprice = inprice
                    }
    
    
                    u = append(u, sales)
                    u = append(u, price)
                    u = append(u, title)
                    u = append(u, url)
    
    
                    fmt.Printf("%v===%v===%v
    ", title, sales, price)
                    log.Printf("%v===%v===%v
    ", title, sales, price)
                    i = i + 1
                    m[url] = u
                }
                fmt.Printf("%v
    ", minprice)
                buildxlsx(words, m, minprice)
            }
    
    
        } else {
            log.Printf("返回网页错误 %v", res.StatusCode)
    
    
        }
    }



    // patherrch lists single-character strings; not referenced in the visible
    // code. NOTE(review): these look like the characters that are not allowed
    // in file names — confirm against the code that uses it.
    var patherrch = [...]string{"/", "\\", ":", "*", "?", "\"", "<", ">", "|"}

    // userAgent is the pool of User-Agent header values GetRandomUserAgent
    // picks from.
    var userAgent = [...]string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
        "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
        "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
        "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
        "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}

    // r is the random generator used to pick a User-Agent, seeded with the
    // current time.
    var r = rand.New(rand.NewSource(time.Now().UnixNano()))

    // tburl and tburlpara are the prefix and suffix placed around the
    // URL-escaped keyword to form the Taobao search URL.
    var tburl = "https://s.taobao.com/search?q="
    var tburlpara = "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201857-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&sort=price-asc"

    // urlChannel is a string channel with a buffer of 200; the original note
    // says it carries href attribute values. Not referenced in the visible code.
    var urlChannel = make(chan string, 200)

    // atagRegExp captures the `g_page_config = ...;` assignment up to the end
    // of its line. MustCompile panics at start-up if the pattern is invalid.
    var atagRegExp = regexp.MustCompile(`g_page_config = (.*?);\n`)

    // chineseRegExp matches a string that consists of exactly one character in
    // the range U+4E00..U+9FA5.
    var chineseRegExp = regexp.MustCompile("^[\u4e00-\u9fa5]$")
    
    
    func GetRandomUserAgent() string {
        return userAgent[r.Intn(len(userAgent))]
    }
    
    



  • 相关阅读:
    hdu 2112 (最短路+map)
    poj 1502 最短路+坑爹题意
    poj 1696 Space Ant (极角排序)
    poj 1410 线段相交判断
    使用本地光盘安装Microsoft .NET Framework 3.5 for Win8.1/WinServer2012R2
    Excel REPT函数使用
    tomcat7配置虚拟目录
    Tomcat 7.0的配置
    js去除空格
    JAVABEAN连接各数据库
  • 原文地址:https://www.cnblogs.com/ipub520/p/ipub520.html
Copyright © 2011-2022 走看看