zoukankan      html  css  js  c++  java
  • GO语言_用redis作为url队列的爬虫

    // Copyright 2016 laosj Author @songtianyi. All Rights Reserved.
    //
    // Licensed under the Apache License, Version 2.0 (the "License");
    // you may not use this file except in compliance with the License.
    // You may obtain a copy of the License at
    //
    //      http://www.apache.org/licenses/LICENSE-2.0
    //
    // Unless required by applicable law or agreed to in writing, software
    // distributed under the License is distributed on an "AS IS" BASIS,
    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    // See the License for the specific language governing permissions and
    // limitations under the License.
    
    package main
    
    import (
        "github.com/songtianyi/laosj/downloader"
        "github.com/songtianyi/laosj/spider"
        "github.com/songtianyi/rrframework/connector/redis"
        "github.com/songtianyi/rrframework/logs"
        "github.com/songtianyi/rrframework/storage"
        "regexp"
        "strconv"
        "sync"
    )
    
    func main() {
        d := &downloader.Downloader{
            ConcurrencyLimit: 10,
            UrlChannelFactor: 10,
            RedisConnStr:     "127.0.0.1:6379",
            SourceQueue:      "DATA:IMAGE:MZITU:XINGGAN",
            Store:            rrstorage.CreateLocalDiskStorage("/Users/deer_mac/Desktop/自拍/"),
        }
        go func() {
            d.Start()
        }()
    
        // step1: find total pages
        s, err := spider.CreateSpiderFromUrl("http://www.mzitu.com/share")
        if err != nil {
            logs.Error(err)
            return
        }
        rs, _ := s.GetText("div.main>div.main-content>div.postlist>div>div.pagenavi-cm>a")
        max := spider.FindMaxFromSliceString(1, rs)
    
        // step2: for every page, find all img tags
        var wg sync.WaitGroup
        var mu sync.Mutex
        step2 := make([]string, 0)
        for i := 1; i <= max; i++ {
            wg.Add(1)
            go func(ix int) {
                defer wg.Done()
                ns, err := spider.CreateSpiderFromUrl(s.Url + "/comment-page-" + strconv.Itoa(ix) + "#comments/")
                if err != nil {
                    logs.Error(err)
                    return
                }
                t, _ := ns.GetHtml("div.main>div.main-content>div.postlist>div>ul>li>div>p")
                mu.Lock()
                step2 = append(step2, t...)
                mu.Unlock()
            }(i)
        }
        wg.Wait()
        err, rc := rrredis.GetRedisClient(d.RedisConnStr)
        if err != nil {
            logs.Error(err)
            return
        }
        // parse url
        for _, v := range step2 {
            re := regexp.MustCompile("src="(\S+)"")
            url := re.FindStringSubmatch(v)[1]
            key := d.SourceQueue
            if _, err := rc.RPush(key, url); err != nil {
                logs.Error(err)
                return
            }
        }
        d.WaitCloser()
    }

    运行前需要先启动 redis 服务(程序默认连接 127.0.0.1:6379), 然后直接运行上面的程序即可.

  • 相关阅读:
    OpenDaylight二层转发机制实验
    OpenvSwitch的GRE、Vxlan隧道
    Open vSwitch实验
    实验五 RYU控制器基本应用
    实验四 POX控制器编程实验指导
    实验二 OpenFlow应用实践
    SDN实验三 OpenFlow协议分析
    SDN实验一:mininet应用实践
    序列幂次求和的快速计算
    luogu P1409 骰子 题解
  • 原文地址:https://www.cnblogs.com/mafeng/p/6796029.html
Copyright © 2011-2022 走看看