zoukankan      html  css  js  c++  java
  • [日常] Go语言--并发的Web爬虫

    两种实现方式:

    crawler.go 

    package main
    
    import (
            "fmt"
            "links"
            //"log"
            "os"
    )
    
    // main crawls the web breadth-first starting from the command-line
    // URLs, spawning one goroutine per newly discovered link.
    func main() {
            worklist := make(chan []string)

            // n counts pending sends to worklist. The loop below runs
            // until it drains to zero, so the program terminates once
            // every discovered link has been crawled — the original
            // `for list := range worklist` blocked forever because the
            // channel was never closed.
            n := 1 // one pending send: the initial command-line arguments

            // Start with the command-line arguments.
            go func() { worklist <- os.Args[1:] }()

            // Crawl the web concurrently, de-duplicating with seen.
            seen := make(map[string]bool)
            for ; n > 0; n-- {
                    list := <-worklist
                    for _, link := range list {
                            if !seen[link] {
                                    seen[link] = true
                                    n++ // the goroutine below performs exactly one send
                                    go func(link string) {
                                            worklist <- crawl(link)
                                    }(link)
                            }
                    }
            }
    }
    
    // tokens is a counting semaphore: crawl acquires a slot before calling
    // links.Extract and releases it afterwards, limiting concurrent fetches
    // to 20 at a time.
    var tokens = make(chan struct{}, 20) 
    
    // crawl prints url, fetches the page, and returns every link found on
    // it. Concurrent fetches are bounded by the tokens semaphore. On a
    // fetch error the error is reported and the page contributes no links.
    func crawl(url string) []string {
            fmt.Println(url)
            tokens <- struct{}{} // acquire a token
            list, err := links.Extract(url)
            <-tokens // release the token
            if err != nil {
                    // Report the failure instead of silently dropping it
                    // (the original left this branch empty).
                    fmt.Fprintln(os.Stderr, err)
            }
            return list
    }
    

    crawler2.go 

    package main
    
    import (
            "fmt"
            "links"
            //"log"
            "os"
            "strings"
    )
    
    // main crawls the web using a fixed pool of 20 worker goroutines fed
    // through unseenLinks; the main goroutine de-duplicates results.
    //
    // NOTE(review): this file imports "strings" but only commented-out
    // code uses it — Go rejects unused imports, so remove it or blank-
    // import it (`_ "strings"`) for the file to compile.
    func main() {
            worklist := make(chan []string)  // lists of URLs, may contain duplicates
            unseenLinks := make(chan string) // de-duplicated URLs

            // n counts pending sends to worklist. The loop below runs until
            // it drains to zero, so the program terminates — the original
            // `for list := range worklist` blocked forever because the
            // channel was never closed.
            n := 1 // one pending send: the initial command-line arguments

            // Start with the command-line arguments.
            go func() { worklist <- os.Args[1:] }()

            // Create 20 crawler goroutines to fetch each unseen link.
            for i := 0; i < 20; i++ {
                    go func() {
                            for link := range unseenLinks {
                                    foundLinks := crawl(link)
                                    // Send asynchronously so a busy main
                                    // loop cannot deadlock the workers.
                                    go func() { worklist <- foundLinks }()
                            }
                    }()
            }

            // The main goroutine de-duplicates worklist items
            // and sends the unseen ones to the crawlers.
            seen := make(map[string]bool)
            for ; n > 0; n-- {
                    list := <-worklist
                    for _, link := range list {
                            if !seen[link] {
                                    seen[link] = true
                                    n++ // the worker will send crawl's results back
                                    unseenLinks <- link
                            }
                    }
            }
            close(unseenLinks) // let the 20 worker goroutines exit
    }
    
    // crawl prints url, fetches the page via links.Extract, and returns
    // every link found on it. On a fetch error the error is reported and
    // the page contributes no links.
    func crawl(url string) []string {
            fmt.Println(url)
            list, err := links.Extract(url)
            if err != nil {
                    // Report the failure instead of silently dropping it
                    // (the original left this branch empty).
                    fmt.Fprintln(os.Stderr, err)
            }
            return list
    }
    

      

      

  • 相关阅读:
    将同一个应用程序同时作为 http 和 https
    将数组元素划分为等长的块(二维数组)
    将数组中的空元素转为 undefined
    将某个类型断言为另一个与之毫无关系的类型
    将前端代码放入 Egg 项目中
    将根组件挂载到 DOM 节点上
    将类数组对象转换成数组
    将 ts 代码转成 js 代码
    将代码推迟到系统资源空闲时执行
    React 将 state 传给子组件用
  • 原文地址:https://www.cnblogs.com/taoshihan/p/8994816.html
Copyright © 2011-2022 走看看