  • 1. Find the URL index pages and filter out the detail-page URLs that match the criteria; 2. visit each detail page and collect its information.
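    Before the full program, here is a minimal sketch of that two-step pattern with colly (the selector ".list a[href]", the URL http://example.com/list/, and the hard-coded date are illustrative placeholders, not taken from the sites below):

    package main

    import (
        "fmt"
        "strings"

        "github.com/gocolly/colly"
    )

    func main() {
        var detailURLs []string

        // Step 1: walk the listing page and keep only the links whose row text
        // contains the target date.
        lister := colly.NewCollector()
        lister.OnHTML(".list a[href]", func(e *colly.HTMLElement) { // placeholder selector
            if strings.Contains(e.Text, "2018-09-04") { // placeholder date filter
                detailURLs = append(detailURLs, e.Request.AbsoluteURL(e.Attr("href")))
            }
        })
        lister.Visit("http://example.com/list/") // placeholder listing URL

        // Step 2: visit each surviving detail page and collect its content.
        detail := colly.NewCollector()
        detail.OnScraped(func(r *colly.Response) {
            fmt.Println(len(r.Body), "bytes from", r.Request.URL)
        })
        for _, u := range detailURLs {
            detail.Visit(u)
        }
    }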

    package main

    import (
        // standard library
        "context"
        "fmt"
        "log"
        "math/rand"
        "regexp"
        "strings"
        "time"

        // crawler
        "github.com/gocolly/colly"

        // two MongoDB drivers are used side by side: the (pre-1.0) mongo-go-driver
        // for the inserts in main, and the legacy mgo driver for the queries
        "github.com/mongodb/mongo-go-driver/bson"
        "github.com/mongodb/mongo-go-driver/mongo"
        "gopkg.in/mgo.v2"
        mgoBson "gopkg.in/mgo.v2/bson"
    )

    /*
    Task:
    http://www.cnhan.com/hyzx/
    http://www.cnhan.com/shantui/
    http://www.cnhan.com/pinfo/
    http://www.heze.cn/info
    http://www.heze.cn/qiye/
    http://cn.sonhoo.com/wukong/
    Collect the contact details of the customers behind each site's same-day updates.

    20180828
    Date-string format used by each site:
    "cnhan.com/hyzx/": "20180902",
    "cnhan.com/shantui/": "20180902",
    "cnhan.com/pinfo/": "2018.09.02",
    "heze.cn/info/": "2018.09.02",
    "heze.cn/qiye/": "2018.09.02",
    "sonhoo.com/wukong/": "2018-09-02",

    20180904
    For http://cn.sonhoo.com/wukong/c16?offset=600&limit=50: first walk the list
    pages (which carry the article dates) to pick out the matching article URLs,
    then visit each article page, e.g. http://cn.sonhoo.com/wukong/a213383, to
    collect the customer details.
    */

    type PotentialCustomerWebSiteUrl struct {
        Url string `bson:"url"`
    }

    // Configuration | Colly http://go-colly.org/docs/introduction/configuration/
    const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    // RandomString returns a random 10-19 character string; it is used below as a
    // throwaway User-Agent value. Note that math/rand is never seeded, so the same
    // sequence is produced on every run.
    func RandomString() string {
        b := make([]byte, rand.Intn(10)+10)
        for i := range b {
            b[i] = letterBytes[rand.Intn(len(letterBytes))]
        }
        return string(b)
    }

    // eggSitePathListTargetDate renders a "YYYYMMDD" target date into each site's
    // own date format by inserting that site's separator.
    func eggSitePathListTargetDate(TargetDate string) map[string]string {
        SitePathListTargetDateFmt := map[string]string{
            "cnhan.com/hyzx/":    "",
            "cnhan.com/shantui/": "ONLYtoday",
            "cnhan.com/pinfo/":   ".",
            "heze.cn/info/":      ".",
            "heze.cn/qiye/":      ".",
            "sonhoo.com/wukong/": "-",
        }
        for k, v := range SitePathListTargetDateFmt {
            fmt.Println(k)
            if v == "" {
                SitePathListTargetDateFmt[k] = TargetDate
            } else {
                SitePathListTargetDateFmt[k] = TargetDate[0:4] + v + TargetDate[4:6] + v + TargetDate[6:8]
            }
        }
        fmt.Println(SitePathListTargetDateFmt)
        return SitePathListTargetDateFmt
    }
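    // Worked example: with TargetDate = "20180904" the loop above yields
    //   "cnhan.com/hyzx/":    "20180904"
    //   "cnhan.com/shantui/": "2018ONLYtoday09ONLYtoday04" (a sentinel that never
    //                         matches; /shantui/ only ever lists same-day URLs)
    //   "cnhan.com/pinfo/":   "2018.09.04"
    //   "heze.cn/info/":      "2018.09.04"
    //   "heze.cn/qiye/":      "2018.09.04"
    //   "sonhoo.com/wukong/": "2018-09-04"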

    // Crawl data for a fixed date.
    var TargetDate = "20180904"
    var TodayDate = time.Now().Format("20060102")
    var mongoCollectionName = "todayUrls0904TestData"
    var SitePathListTargetDate = eggSitePathListTargetDate(TargetDate)

    // getTargetDateSpideredUrl queries MongoDB for the URLs already crawled for
    // TargetDate, so that they are not collected again.
    func getTargetDateSpideredUrl() []string {
        session, err := mgo.Dial("mongodb://hbaseU:123@192.168.3.103:27017/hbase")
        if err != nil {
            panic(err)
        }
        defer session.Close()
        // Optional. Switch the session to a monotonic behavior.
        session.SetMode(mgo.Monotonic, true)
        Collection := session.DB("hbase").C(mongoCollectionName)
        var PotentialCustomerWebSiteUrls []PotentialCustomerWebSiteUrl
        err = Collection.Find(mgoBson.M{"spiderDate": TargetDate}).All(&PotentialCustomerWebSiteUrls)
        if err != nil {
            log.Fatal(err)
        }
        var PotentialCustomerWebSiteUrlSet []string
        for i, v := range PotentialCustomerWebSiteUrls {
            fmt.Println(i)
            fmt.Println(v.Url)
            PotentialCustomerWebSiteUrlSet = append(PotentialCustomerWebSiteUrlSet, v.Url)
        }
        return PotentialCustomerWebSiteUrlSet
    }

    // eleInArr reports whether ele is already present in arr (a linear scan; Go
    // has no built-in set type or membership operator).
    func eleInArr(ele string, arr []string) bool {
        for _, v := range arr {
            if ele == v {
                fmt.Println("eleInArr", ele)
                return true
            }
        }
        return false
    }
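    // A possible alternative (a sketch, not used above): build a map once and do
    // O(1) lookups instead of a linear scan per link, which can get slow once many
    // URLs have been crawled for the day:
    //   seen := make(map[string]struct{}, len(arr))
    //   for _, v := range arr { seen[v] = struct{}{} }
    //   _, ok := seen[ele]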

    func getTargetDateUrls() []string {
        var targetDateUrls []string
        PotentialCustomerWebSiteUrlSet := getTargetDateSpideredUrl()
        c := colly.NewCollector()
        if TargetDate == TodayDate {
            // This path only lists today's URLs.
            // Instantiate default collector
            c = colly.NewCollector(
                colly.AllowedDomains("www.cnhan.com"),
            )
            // On every a element which has href attribute call callback.
            // Class selector; the wanted URLs all sit on this one page.
            c.OnHTML(".showSort a[href]", func(e *colly.HTMLElement) {
                link := e.Attr("href")
                t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
                if !t {
                    targetDateUrls = append(targetDateUrls, link)
                    fmt.Printf("Link found: %q -> %s ", e.Text, link)
                }
            })
            // Start scraping on http://www.cnhan.com/shantui/
            c.Visit("http://www.cnhan.com/shantui/")

            // New start URL, so instantiate a fresh collector.
            c = colly.NewCollector(
                colly.AllowedDomains("www.cnhan.com"),
                colly.URLFilters(
                    // Regular expressions for the pages to request; matching any one suffices:
                    //   http://www.cnhan.com/hyzx/
                    //   http://www.cnhan.com/hyzx/index-all-2.html
                    // Hard-coded assumption: the target day updates at most 99 pages,
                    // up to http://www.cnhan.com/hyzx/index-all-99.html
                    //   ^[1-9][0-9]{0,1}[^0-9]{0,1}$
                    regexp.MustCompile(`^http://www.cnhan.com/hyzx/(.{0}$)|(index-all-[1-9][0-9]{0,1}[^0-9]{0,1}\.html$)`),
                ),
            )

            // On every a element which has href attribute call callback
            c.OnHTML("a[href]", func(e *colly.HTMLElement) {
                link := e.Attr("href")
                fmt.Printf("Link found: %q -> %s ", e.Text, link)
                c.Visit(e.Request.AbsoluteURL(link))
                d := SitePathListTargetDate["cnhan.com/hyzx/"]
                // Filter target-day URLs by the date embedded in the URL, e.g.
                // http://www.cnhan.com/hyzx/20180827/7109076.html
                reg := regexp.MustCompile(d)
                data := reg.Find([]byte(link))
                regRes := len(data)
                if regRes > 0 {
                    link = "http://www.cnhan.com/hyzx/" + link
                    t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
                    if !t {
                        targetDateUrls = append(targetDateUrls, link)
                        fmt.Printf("Link found: %q -> %s ", e.Text, link)
                    }
                }
            })
            // Before making a request print "Visiting ..."
            c.OnRequest(func(r *colly.Request) {
                fmt.Println("Visiting", r.URL.String())
            })
            // Start scraping on http://www.cnhan.com/hyzx/
            c.Visit("http://www.cnhan.com/hyzx/")
        }

        // New start URL, so instantiate a fresh collector.
        c = colly.NewCollector(
            colly.AllowedDomains("www.cnhan.com"),
            colly.URLFilters(
                // Regular expressions for the pages to request; matching any one suffices:
                //   http://www.cnhan.com/pinfo/
                //   http://www.cnhan.com/pinfo/index-5.html
                // Hard-coded assumption: at most 99 pages, up to http://www.cnhan.com/pinfo/index-99.html
                regexp.MustCompile(`^http://www.cnhan.com/pinfo/(.{0}$)|(index-[1-9][0-9]{0,1}[^0-9]{0,1}\.html$)`),
            ),
        )
        // On every a element which has href attribute call callback
        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
            link := e.Attr("href")
            fmt.Printf("Link found: %q -> %s ", e.Text, link)
            c.Visit(e.Request.AbsoluteURL(link))
            // Filter on the date shown in the row text, e.g.
            // http://www.cnhan.com/pinfo/313257.html "周口水泥彩砖具有的特色是什么2018.08.27"
            eDate := e.ChildText(".span2")
            d := SitePathListTargetDate["cnhan.com/pinfo/"]
            if strings.Contains(eDate, d) {
                link := e.Attr("href")
                link = "http://www.cnhan.com" + link
                fmt.Printf("Link found: %q -> %s ", e.Text, link)
                t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
                if !t {
                    targetDateUrls = append(targetDateUrls, link)
                    fmt.Printf("Link found: %q -> %s ", e.Text, link)
                }
            }
        })
        // Before making a request print "Visiting ..."
        c.OnRequest(func(r *colly.Request) {
            fmt.Println("Visiting", r.URL.String())
        })
        // Start scraping on http://www.cnhan.com/pinfo/
        c.Visit("http://www.cnhan.com/pinfo/")

        // New start URL, so instantiate a fresh collector.
        c = colly.NewCollector(
            colly.AllowedDomains("www.heze.cn"),
        )
        // On every a element which has href attribute call callback (class selector).
        c.OnHTML(".news_list_r a[href]", func(e *colly.HTMLElement) {
            link := e.Attr("href")
            fmt.Printf("Link found: %q -> %s ", e.Text, link)
            targetDateUrls = append(targetDateUrls, link)
        })
        // Start scraping on http://www.heze.cn/info/
        c.Visit("http://www.heze.cn/info/")

        /*
        Target URLs on this site:
        http://www.heze.cn/info/
        http://www.heze.cn/qiye/
        Two discovery strategies:
        1. Enter through each parent URL separately, e.g.
           http://www.heze.cn/qiye/18240670888/show-37-1367148.html
           http://www.heze.cn/info/LEbigong/show-1-13931879.html
           (the opposite trade-offs of 2)
        2. Crawl the whole site.
           Pros: simple filter rules, simple code.
           Cons: crawling is slow, and the results are awkward to classify
           afterwards (e.g. by product type or publication date).
        */

        // New start URL, so instantiate a fresh collector.
        // http://www.heze.cn/qiye/ - this page and its main child pages change on refresh
        // http://www.heze.cn/qiye/list-8.html
        c = colly.NewCollector(
            colly.AllowedDomains("www.heze.cn"),
            colly.URLFilters(
                // Regular expressions for the pages to request; matching any one suffices:
                regexp.MustCompile(`^http://www.heze.cn/qiye/(.{0}$)|(list-\d+-\d+\.html$)`),
            ),
        )
        // On every a element which has href attribute call callback
        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
            link := e.Attr("href")
            fmt.Printf("Link found: %q -> %s ", e.Text, link)
            c.Visit(e.Request.AbsoluteURL(link))
            // Detail pages look like http://www.heze.cn/qiye/hongfei688/show-44-14825619.html
            reg := regexp.MustCompile(`^http://www.heze.cn/qiye/[0-9a-zA-Z]+/show-\d+-\d+\.html$`)
            data := reg.Find([]byte(link))
            regRes := len(data)
            if regRes > 0 {
                fmt.Printf("Link found: %q -> %s ", e.Text, link)
                t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
                if !t {
                    targetDateUrls = append(targetDateUrls, link)
                    fmt.Printf("Link found: %q -> %s ", e.Text, link)
                }
            }
        })
        // Before making a request print "Visiting ..."
        c.OnRequest(func(r *colly.Request) {
            fmt.Println("Visiting", r.URL.String())
        })
        // Start scraping on http://www.heze.cn/qiye/
        c.Visit("http://www.heze.cn/qiye/")

        // New start URL, so instantiate a fresh collector.
        // Some target-day pages sit on the start page mixed in with pages from other
        // days, so a selector alone cannot separate them; the row text must be parsed.
        // Whole-site filtering:
        //   category pages: http://cn.sonhoo.com/wukong/c133
        //   article pages:  http://cn.sonhoo.com/wukong/a191114
        c = colly.NewCollector(
            colly.AllowedDomains("cn.sonhoo.com"),
            colly.URLFilters(
                // Regular expressions for the pages to request; matching any one suffices:
                regexp.MustCompile(`^http://cn.sonhoo.com/wukong/$`),
                //regexp.MustCompile(`^http://cn.sonhoo.com/wukong/[ac]{1}\d+$`),
                regexp.MustCompile(`^http://cn.sonhoo.com/wukong/[c]{1}\d+$`),
                // Paged article lists, e.g. http://cn.sonhoo.com/wukong/c0?offset=150&limit=50
                regexp.MustCompile(`^http://cn.sonhoo.com/wukong/c\d+\?offset=\d+\&limit=\d+$`),
            ),
            // Without a User-Agent header the site returns no data.
            // colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
        )
        // Limit the number of threads started by colly to five and add a random
        // delay when visiting links whose domain matches the "*sonhoo.*" glob.
        // (Parallelism and the c.Wait() below only take effect if the collector is
        // created with colly.Async(true); as written, the visits run synchronously.)
        c.Limit(&colly.LimitRule{
            DomainGlob:  "*sonhoo.*",
            Parallelism: 5,
            RandomDelay: 5 * time.Second,
        })

        // Walk every list page, e.g. http://cn.sonhoo.com/wukong/c4?offset=100&limit=50,
        // so that every URL that might belong to the target day is reached.
        // On every a element which has href attribute call callback
        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
            link := e.Attr("href")
            fmt.Printf("Link found: %q -> %s ", e.Text, link)
            c.Visit(e.Request.AbsoluteURL(link))
        })

        // Record the target day's URLs.
        c.OnHTML("div[class=page-articles__articles]>ul>li", func(e *colly.HTMLElement) {
            link := e.ChildAttr("a", "href")
            pageDate := e.ChildText("span")
            fmt.Println(pageDate)
            fmt.Println(link)
            d := SitePathListTargetDate["sonhoo.com/wukong/"]
            fmt.Println("tDate", d)
            fmt.Println("pageDate", pageDate)
            if strings.Contains(pageDate, d) {
                link = "http://cn.sonhoo.com" + link
                t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
                if !t {
                    targetDateUrls = append(targetDateUrls, link)
                    fmt.Printf("Link found: %q -> %s ", e.Text, link)
                }
            }
            c.Visit(e.Request.AbsoluteURL(link))
        })

        c.OnScraped(func(r *colly.Response) {
            fmt.Println("Finished", r.Request.URL)
        })
        // Before making a request print "Visiting ..." and set a random User-Agent.
        c.OnRequest(func(r *colly.Request) {
            fmt.Println("Visiting", r.URL.String())
            r.Headers.Set("User-Agent", RandomString())
        })
        c.OnError(func(_ *colly.Response, err error) {
            log.Println("Something went wrong:", err)
        })
        c.OnResponse(func(r *colly.Response) {
            fmt.Println("Visited", r.Request.URL)
        })
        // Start scraping. Candidate entry points:
        //   http://cn.sonhoo.com/wukong/   article list without dates
        //   http://cn.sonhoo.com/wukong/c0 dated article list (20180904: 50 pages, no "l" category)
        c.Visit("http://cn.sonhoo.com/wukong/c0")
        // Wait until threads are finished
        c.Wait()

        return targetDateUrls
    }

    func main() {
        var targetDateUrls = getTargetDateUrls()
        fmt.Println(targetDateUrls)
        fmt.Println(len(targetDateUrls))
        for _, targetDateUrl := range targetDateUrls {
            // Instantiate default collector
            c := colly.NewCollector()
            // Once each detail page is scraped, store the whole page HTML in MongoDB.
            c.OnScraped(func(r *colly.Response) {
                reqUrl := strings.TrimSpace(fmt.Sprintln(r.Request.URL))
                wholePageHtml := string(r.Body)
                client, err := mongo.Connect(context.Background(), "mongodb://hbaseU:123@192.168.3.103:27017/hbase", nil)
                if err != nil {
                    fmt.Println(err)
                }
                db := client.Database("hbase")
                coll := db.Collection(mongoCollectionName)
                // The target day may be crawled several times; a URL from the same
                // site and path must not be stored twice for the same day.
                // The crawl date is stored so each site's update rhythm can be analysed.
                result, err := coll.InsertOne(
                    context.Background(),
                    bson.NewDocument(
                        bson.EC.String("spiderDate", TargetDate),
                        bson.EC.String("url", reqUrl),
                        bson.EC.String("html", wholePageHtml),
                    ))
                fmt.Println(err)
                fmt.Println(result)
            })
            c.Visit(targetDateUrl)
        }
    }
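    The comment in main says a URL must not be stored twice for the same target day, but InsertOne as written does not enforce that; it relies entirely on the pre-filtering against getTargetDateSpideredUrl(). One way to enforce it at the database level (a sketch, assuming the same MongoDB instance and collection as above) is a unique compound index on spiderDate and url, created once with the mgo driver already used here; a repeated insert then fails with a duplicate-key error instead of silently creating a second document:

    package main

    import (
        "log"

        "gopkg.in/mgo.v2"
    )

    func main() {
        session, err := mgo.Dial("mongodb://hbaseU:123@192.168.3.103:27017/hbase")
        if err != nil {
            log.Fatal(err)
        }
        defer session.Close()

        coll := session.DB("hbase").C("todayUrls0904TestData")
        // Unique compound index: a second document with the same
        // (spiderDate, url) pair is rejected by the server.
        if err := coll.EnsureIndex(mgo.Index{
            Key:    []string{"spiderDate", "url"},
            Unique: true,
        }); err != nil {
            log.Fatal(err)
        }
    }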

