zoukankan      html  css  js  c++  java
  • webmagic学习之路-2:采集安居客经纪人列表

    相比第 1 篇,这次稍微成熟了一点,会用的东西也多了。
    正则用得不好,还有很多东西不会,大神轻喷!



    package com.action;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.management.JMException;
    import javax.swing.plaf.synth.SynthSpinnerUI;
    
    import org.bson.Document;
    
    import com.model.AgentListModel;
    import com.model.Model_AnjukeList;
    import com.mongodb.BasicDBObject;
    import com.util.Constants;
    import com.util.GetDate;
    import com.util.MysqlUtils;
    import com.util.MD5With32;
    import com.util.MongoDBUtil;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.monitor.SpiderMonitor;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.selector.Html;
    import us.codecraft.webmagic.selector.Selectable;
    
    /**
     * WebMagic {@link PageProcessor} that walks Anjuke "tycoon" pages and stores
     * the agents found on listing pages through {@code MysqlUtils.InsertAnjukeAgent}.
     *
     * <p>Pages whose URL does not contain the {@code <x>-q-<y>/} segment are only
     * used to discover further links; pages whose URL does contain it are parsed
     * for {@code div.jjr-info} entries.
     *
     * <p>{@link #main(String[])} runs the spider with 20 threads, so
     * {@link #process(Page)} is invoked concurrently. All per-page working state is
     * therefore kept in local variables instead of shared static fields.
     */
    public class GetAnjukeAgentList implements PageProcessor {

        /** URL shape of a page that is parsed for agents: {@code .../tycoon/<x>-q-<y>/}. */
        private static final String LISTING_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/";
        /** URL shape {@code .../tycoon/<x>/}, used to pick links to follow from non-listing pages. */
        private static final String AREA_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+/";
        /** Shape of a pagination link ({@code .../p<N>/}) looked for in a listing page's HTML. */
        private static final String PAGED_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/p[0-9]+/";
        /** Suffix stripped from every breadcrumb label. */
        private static final String CRUMB_SUFFIX = "经纪人";

        // The static fields below are kept declared only so that code elsewhere that
        // may reference them still compiles. process() no longer reads or writes
        // them: sharing one unsynchronized list between 20 worker threads mixed up
        // rows from different pages.
        static AgentListModel anjukeList;
        static List<String> list = new ArrayList<String>();
        static List<AgentListModel> list_insert = new ArrayList<AgentListModel>();
        static BasicDBObject  doc = null;

        /** Crawl settings: 1000 ms sleep time, 3 retries, UTF-8, desktop Chrome user agent. */
        private final Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

        /**
         * @return the {@link Site} configuration used for every request of this processor
         */
        @Override
        public Site getSite() {
            return this.site;
        }

        /**
         * Handles one downloaded page: either queues further links to crawl or
         * extracts the agents on the page and hands them to
         * {@code MysqlUtils.InsertAnjukeAgent}.
         *
         * @param page the downloaded page supplied by WebMagic
         */
        @Override
        public void process(Page page) {
            System.out.println("code:"+page.getStatusCode());
            System.out.println(page.getUrl());

            if (!page.getUrl().regex(LISTING_URL_REGEX).match()) {
                queueDiscoveryLinks(page);
                return;
            }

            // Divider line (the marker the note below the listing refers to) --------------------
            // A fresh list per call: nothing is shared with other worker threads, and
            // nothing is left behind for the next page if a later step throws.
            List<AgentListModel> agents = extractAgents(page);

            // NOTE(review): the author reports that evaluating these three breadcrumb
            // lookups before the agent extraction yields no matches; the original
            // order (agents first, breadcrumbs second) is kept.
            String city = breadcrumb(page, 2);
            String zone = breadcrumb(page, 3);
            String street = breadcrumb(page, 4);
            MysqlUtils.InsertAnjukeAgent(agents, city, zone, street);

            if (page.getHtml().regex(PAGED_URL_REGEX).match()) {
                // Follow the pagination links.
                page.addTargetRequests(page.getHtml().xpath("//div[@class='page-content']/div/a/@href").all());
            }
        }

        /**
         * Queues the links found on a non-listing page: every {@code elems-l} link
         * matching {@link #AREA_URL_REGEX}, plus all {@code sub-items} links when the
         * page's own URL matches that regex.
         */
        private static void queueDiscoveryLinks(Page page) {
            page.addTargetRequests(
                    page.getHtml().xpath("//span[@class='elems-l']/a/@href").regex(AREA_URL_REGEX).all());
            if (page.getUrl().regex(AREA_URL_REGEX).match()) {
                page.addTargetRequests(page.getHtml().xpath("//div[@class='sub-items']/a/@href").all());
            }
        }

        /**
         * Builds one {@code AgentListModel} per {@code div.jjr-info} node on the page.
         *
         * @return a new list owned by the caller; empty when the page has no such nodes
         */
        private static List<AgentListModel> extractAgents(Page page) {
            List<AgentListModel> agents = new ArrayList<AgentListModel>();
            String pageUrl = page.getUrl()+"";
            for (Selectable node : page.getHtml().xpath("//div[@class='jjr-info']").nodes()) {
                String name = node.xpath("//div/h3/a/text()").get();
                String staffNo = node.xpath("//div/h3/a/@href").get();
                String company = node.xpath("//p[@class='jjr-desc']/a[1]/text()").get();
                String companyUrl = node.xpath("//p[@class='jjr-desc']/a[1]/@href").get();
                String store = node.xpath("//p[@class='jjr-desc']/a[2]/text()").get();
                String storeUrl = node.xpath("//p[@class='jjr-desc']/a[2]/@href").get();
                agents.add(new AgentListModel("", "", "", "", "anjuke", GetDate.getDay0(), pageUrl,
                        name, staffNo, company, companyUrl, store, storeUrl));
            }
            return agents;
        }

        /**
         * Reads the text of the {@code index}-th breadcrumb link and strips the
         * {@link #CRUMB_SUFFIX} suffix text from it.
         *
         * @param index 1-based position of the {@code <a>} inside the breadcrumb div
         * @return the cleaned label, or an empty string when the link is absent
         *         (previously a missing link caused a {@link NullPointerException})
         */
        private static String breadcrumb(Page page, int index) {
            String label = page.getHtml()
                    .xpath("//div[@class='p_1180 p_crumbs']/a[" + index + "]/text()")
                    .get();
            return label == null ? "" : label.replace(CRUMB_SUFFIX, "");
        }

        /**
         * Starts one spider per seed URL (currently only the Chongqing entry page),
         * each with a console pipeline and 20 threads, and blocks until it finishes.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            List<String> seeds = new ArrayList<String>();
            seeds.add("https://chongqing.anjuke.com/tycoon/");
            for (String seed : seeds) {
                Spider.create(new GetAnjukeAgentList())
                        .addUrl(seed)
                        .addPipeline(new ConsolePipeline())
                        .thread(20)
                        .run();
            }
        }
    }

    这段代码有个很大的疑问,不知道有没有大神给解释一下。

    String city = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[2]/text()").get().replace("经纪人", "");
    String zone = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[3]/text()").get().replace("经纪人", "");
    String street = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[4]/text()").get().replace("经纪人", "");

    上面这3行,如果你把位置移动一下。

    移动到分割线上面去。

    这3个xpath会匹配不到内容,我研究了很长时间,没搞明白,也就没再研究下去了。

    评论区留言告知下,谢谢!!

     
  • 相关阅读:
    【转】构建高并发高可用的电商平台架构实践
    【转】深入解析浏览器的幕后工作原理
    【转】解释器,树遍历解释器,基于栈与基于寄存器
    EasyDarwin返回401 Unauthorized解决方法
    【转】SDP file
    音频PCM格式
    H264相关知识
    testNG之异常测试
    testNG之组测试
    testNG之顺序执行
  • 原文地址:https://www.cnblogs.com/tnsay/p/10895325.html
Copyright © 2011-2022 走看看