zoukankan      html  css  js  c++  java
  • Colly provides a clean interface to write any kind of crawler/scraper/spider

    Scraping Framework for Golang http://go-colly.org/

    https://github.com/gocolly/colly

    package main

    import (
    "fmt"

    "github.com/gocolly/colly"
    "time"
    "regexp"
    "strings"
    )

    /*
    task
    http://www.cnhan.com/hyzx/
    http://www.cnhan.com/shantui/
    http://www.cnhan.com/pinfo/

    http://www.heze.cn/info
    http://www.heze.cn/qiye/

    采集站点当日更新数据的客户联系方式

    */
    func getTodayUrls() []string {
    var todayUrls []string
    // Instantiate default collector
    c := colly.NewCollector(
    colly.AllowedDomains("www.cnhan.com"),
    )
    // On every a element which has href attribute call callback
    // 类选择器
    //url仅在本页
    c.OnHTML(".showSort a[href]", func(e *colly.HTMLElement) {
    link := e.Attr("href")
    todayUrls = append(todayUrls, link)
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    })

    // Start scraping on http://www.cnhan.com/shantui/
    c.Visit("http://www.cnhan.com/shantui/")

    //起始路由改变
    // Instantiate default collector
    c = colly.NewCollector(
    colly.AllowedDomains("www.cnhan.com"),
    colly.URLFilters(
    //请求页面的正则表达式,满足其一即可
    //http://www.cnhan.com/hyzx/
    //http://www.cnhan.com/hyzx/index-all-2.html
    //硬代码:当天最多更新99页http://www.cnhan.com/hyzx/index-all-99.html
    //^[1-9][0-9]{0,1}[^0-9]{0,1}$
    regexp.MustCompile("^http://www.cnhan.com/hyzx/(.{0}$)|(index-all-[1-9][0-9]{0,1}[^0-9]{0,1}\.html$)"),
    ),
    )
    // On every a element which has href attribute call callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    link := e.Attr("href")
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    c.Visit(e.Request.AbsoluteURL(link))
    datetime := time.Now().Format("20060102")
    fmt.Println(datetime)
    reg := regexp.MustCompile(datetime) // http://www.cnhan.com/hyzx/20180827/7109076.html 通过url格式过滤出今天的url
    data := reg.Find([]byte(link))
    regRes := len(data)
    if regRes > 0 {
    link = "http://www.cnhan.com/hyzx/" + link
    todayUrls = append(todayUrls, link)
    }
    })

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
    fmt.Println("Visiting", r.URL.String())
    })

    // Start scraping on http://www.cnhan.com/shantui/
    c.Visit("http://www.cnhan.com/hyzx/")

    //起始路由改变
    // Instantiate default collector
    c = colly.NewCollector(
    colly.AllowedDomains("www.cnhan.com"),
    colly.URLFilters(
    //请求页面的正则表达式,满足其一即可
    //http://www.cnhan.com/pinfo/
    //http://www.cnhan.com/pinfo/index-5.html
    //硬代码:当天最多更新99页http://www.cnhan.com/pinfo/index-99.html
    regexp.MustCompile("^http://www.cnhan.com/pinfo/(.{0}$)|(index-[1-9][0-9]{0,1}[^0-9]{0,1}\.html$)"),
    ),
    )
    // On every a element which has href attribute call callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    link := e.Attr("href")
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    c.Visit(e.Request.AbsoluteURL(link))
    //文本过滤
    eDate := e.ChildText(".span2")
    //http://www.cnhan.com/pinfo/313257.html 周口水泥彩砖具有的特色是什么2018.08.27
    datetime := time.Now().Format("2006.01.02")
    if (strings.Contains(eDate, datetime)) {
    link := e.Attr("href")
    link = "http://www.cnhan.com" + link
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    todayUrls = append(todayUrls, link)
    }
    })

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
    fmt.Println("Visiting", r.URL.String())
    })

    // Start scraping on http://www.cnhan.com/shantui/
    c.Visit("http://www.cnhan.com/pinfo/")

    //起始路由改变
    // Instantiate default collector
    c = colly.NewCollector(
    colly.AllowedDomains("www.heze.cn"),
    )
    // On every a element which has href attribute call callback
    // 类选择器
    c.OnHTML(".news_list_r a[href]", func(e *colly.HTMLElement) {
    link := e.Attr("href")
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    todayUrls = append(todayUrls, link)
    })

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
    fmt.Println("Visiting", r.URL.String())
    })

    // Start scraping on http://www.cnhan.com/shantui/
    c.Visit("http://www.heze.cn/info/")

    /*
    站内目标url
    http://www.heze.cn/info/
    http://www.heze.cn/qiye/
    检测思路:
    1、按父url,分别进入 http://www.heze.cn/qiye/18240670888/show-37-1367148.html http://www.heze.cn/info/LEbigong/show-1-13931879.html
    与2反
    2、按照全站进入
    优点:过滤规则简单,代码代码简单;爬取结果数据不便于分类处理,比如产品类型、发布时间;
    缺点:爬爬取速度慢
    */

    //起始路由改变
    //http://www.heze.cn/qiye/ 该页面、其主体子页面,刷新,内容变化
    //http://www.heze.cn/qiye/list-8.html
    // Instantiate default collector
    c = colly.NewCollector(
    colly.AllowedDomains("www.heze.cn"),
    colly.URLFilters(
    //请求页面的正则表达式,满足其一即可
    regexp.MustCompile("^http://www.heze.cn/qiye/(.{0}$)|(list-\d+-\d+\.html$)"),
    ),
    )
    // On every a element which has href attribute call callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    link := e.Attr("href")
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    c.Visit(e.Request.AbsoluteURL(link))
    // http://www.heze.cn/qiye/hongfei688/show-44-14825619.html
    reg := regexp.MustCompile("^http://www.heze.cn/qiye/[0-9a-zA-Z]+/show-\d+-\d+\.html$")
    data := reg.Find([]byte(link))
    regRes := len(data)
    if regRes > 0 {
    fmt.Printf("Link found: %q -> %s ", e.Text, link)
    todayUrls = append(todayUrls, link)
    }
    })

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
    fmt.Println("Visiting", r.URL.String())
    })

    // Start scraping on http://www.heze.cn/qiye/
    c.Visit("http://www.heze.cn/qiye/")

    return todayUrls
    }

    // main runs the crawl and prints the collected URLs followed by how
    // many were found.
    func main() {
        urls := getTodayUrls()
        fmt.Println(urls)
        fmt.Println(len(urls))
    }




  • 相关阅读:
    Asp.net2.0 中自定义过滤器对Response内容进行处理 dodo
    自动化测试工具 dodo
    TestDriven.NET 2.0——单元测试的好助手(转) dodo
    JS弹出窗口的运用与技巧 dodo
    ElasticSearch 简介 规格严格
    修改PostgreSQL字段长度导致cached plan must not change result type错误 规格严格
    Linux系统更改时区(转) 规格严格
    mvn编译“Cannot find matching toolchain definitions for the following toolchain types“报错解决方法 规格严格
    ElasticSearch 集群 & 数据备份 & 优化 规格严格
    Elasticsearch黑鸟教程22:索引模板的详细介绍 规格严格
  • 原文地址:https://www.cnblogs.com/rsapaper/p/9525009.html
Copyright © 2011-2022 走看看