zoukankan      html  css  js  c++  java
  • webmagic学习之路-3:采集安居客经纪人详情页

    这里希望安居客的同行的轻喷!!单纯的做测试,玩玩。

    就这么糟践你们的服务器了!!!sorry!

    这次学会了如何在 webmagic 中设置可接受的 HTTP 返回状态码。之前返回 404 的页面根本不会进入 process 方法,为此纳闷了很久,也百度了半天。

    看了好半天源码,才知道原来可以用 Site 的 setAcceptStatCode 方法设置哪些状态码的页面会进入 process。之所以下决心去看源码,是因为 logger 打印的内容表明 webmagic 其实已经拿到了 404 响应,只是没有交给 process 处理而已。

    也同时学会了 scheduler

    package com.action;
    
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;
    import java.util.TreeSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.management.JMException;
    import javax.swing.plaf.synth.SynthSpinnerUI;
    
    import org.apache.commons.collections.bag.SynchronizedSortedBag;
    import org.apache.log4j.Logger;
    import org.bson.Document;
    
    import com.model.AgentListByNumModel;
    import com.model.AgentListModel;
    import com.model.Model_AnjukeList;
    import com.mongodb.BasicDBObject;
    import com.util.Constants;
    import com.util.GetDate;
    import com.util.MysqlUtils;
    import com.util.MD5With32;
    import com.util.MongoDBUtil;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.downloader.AbstractDownloader;
    import us.codecraft.webmagic.downloader.Downloader;
    import us.codecraft.webmagic.monitor.SpiderMonitor;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.scheduler.PriorityScheduler;
    import us.codecraft.webmagic.selector.Html;
    import us.codecraft.webmagic.selector.Selectable;
    
    /**
     * WebMagic {@link PageProcessor} that visits Anjuke agent shop pages by numeric id
     * and stores one row per agent / service area through {@link MysqlUtils}.
     *
     * <p>{@link #main(String[])} runs the spider with 25 worker threads, so
     * {@link #process(Page)} keeps all per-page state in local variables instead of
     * the shared static fields.
     */
    public class GetAnjukeAgentByNum implements PageProcessor {

        static Logger logger = Logger.getLogger(GetAnjukeAgentByNum.class);

        // Package-visible statics kept so any code elsewhere in the package that
        // references them still compiles. process() no longer uses anjukeList or
        // list_insert as scratch space, because sharing them across worker threads
        // lost and duplicated rows.
        static AgentListByNumModel anjukeList;
        static List<String> list = new ArrayList<String>();
        static List<AgentListByNumModel> list_insert = new ArrayList<AgentListByNumModel>();
        static BasicDBObject doc = null;
        /** Number of anti-crawler verification pages seen so far; update via {@link #countBlocked()}. */
        static int num = 0;

        private static final int MAX_AGENT_ID = 100000;
        private static final String URL_PREFIX = "https://junranfangchan.anjuke.com/gongsi-jjr-";

        private static final String BLOCKED_MARKER = "访问验证-安居客";
        private static final String CLOSED_MARKER = "经纪人店铺暂时关闭";

        private static final String CRUMB_CITY_XPATH = "//div[@class='p_1180 p_crumbs']/a[2]/text()";
        private static final String CRUMB_NAME_XPATH = "//div[@class='p_1180 p_crumbs']/a[4]/text()";

        // 404 is accepted on purpose: without it WebMagic never hands 404 responses
        // to process(), and missing agent ids could not be recorded.
        private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8").setUserAgent(
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setAcceptStatCode(acceptedStatusCodes());

        /** Builds the set of HTTP status codes whose responses are passed to {@link #process(Page)}. */
        private static Set<Integer> acceptedStatusCodes() {
            Set<Integer> codes = new HashSet<Integer>();
            codes.add(200);
            codes.add(404);
            return codes;
        }

        /**
         * Returns the crawl configuration (sleep time, retries, charset, user agent,
         * accepted status codes 200 and 404).
         */
        @Override
        public Site getSite() {
            return this.site;
        }

        /**
         * Parses one agent page and writes the resulting rows to MySQL.
         *
         * <ul>
         *   <li>404, an empty breadcrumb city, or a "shop temporarily closed" page:
         *       a placeholder row carrying only the URL is inserted.</li>
         *   <li>Anti-crawler verification page: counted and skipped; nothing is inserted.</li>
         *   <li>Page without any breadcrumb: logged and skipped (previously this raised
         *       a NullPointerException).</li>
         *   <li>Otherwise: one row per service area, or a single row whose zone/street
         *       come from the address title when no service area is listed.</li>
         * </ul>
         *
         * @param page the downloaded page handed over by WebMagic
         */
        @Override
        public void process(Page page) {
            String url = page.getUrl() + "";

            if (page.getStatusCode() == 404) {
                insertPlaceholder(url);
                return;
            }

            Html html = page.getHtml();
            // Serialize the document once; the original re-rendered it for every check.
            String rawHtml = html + "";

            // Checked before any field extraction: a verification page has none of the
            // expected nodes, so parsing it first failed before the counter was reached.
            if (rawHtml.contains(BLOCKED_MARKER)) {
                System.out.println("被封次数 : " + countBlocked());
                return;
            }

            if (rawHtml.contains(CLOSED_MARKER)) {
                insertPlaceholder(url);
                return;
            }

            String crumbCity = html.xpath(CRUMB_CITY_XPATH).get();
            if (crumbCity == null) {
                logger.warn("breadcrumb not found, page skipped: " + url);
                return;
            }
            String city = crumbCity.replace("经纪人", "");
            if (city.equals("")) {
                insertPlaceholder(url);
                return;
            }

            String name = nullToEmpty(html.xpath(CRUMB_NAME_XPATH).get()).replace("的店铺", "");
            String staffNo = url;
            String company = html.xpath("//div[@class='section service']/dl/dd/p[1]/a/text()").get();
            String company_url = html.xpath("//div[@class='section service']/dl/dd/p[1]/a/@href").get();
            String store = html.xpath("//div[@class='section service']/dl/dd/p[2]/a/text()").get();
            String store_url = html.xpath("//div[@class='section service']/dl/dd/p[2]/a/@href").get();
            // List.toString() with the brackets stripped, i.e. "a, b, c".
            String comms = (html.xpath("//dl[@class='item last']/dd/a/text()").all() + "")
                    .replace("[", "").replace("]", "");
            String contact = extractPhone(html.xpath("//head/meta[3]/@content").get());

            String zone = "";
            String street = "";
            // Local buffer: every worker thread builds and flushes its own rows.
            List<AgentListByNumModel> rows = new ArrayList<AgentListByNumModel>();

            List<String> areas = html.xpath("//div[@class='section service']/dl[3]/dd/a/text()").all();
            if (areas == null || areas.isEmpty()) {
                String title = html.xpath("//div[@class='details-item']/span[@class='comm-address']/@title").get();
                String[] parts = splitArea(areaFromAddressTitle(title));
                if (parts != null) {
                    zone = parts[0];
                    street = parts[1];
                }
                rows.add(new AgentListByNumModel("", city, zone, street, "anjuke", GetDate.getDay0(),
                        url, name, staffNo, company, company_url, store, store_url, contact, comms));
            } else {
                for (String area : areas) {
                    String[] parts = splitArea(area);
                    // NOTE(review): as in the original, an entry without "-" reuses the
                    // zone/street of the previous entry — confirm this is intended.
                    if (parts != null) {
                        zone = parts[0];
                        street = parts[1];
                    }
                    rows.add(new AgentListByNumModel("", city, zone, street, "anjuke", GetDate.getDay0(),
                            url, name, staffNo, company, company_url, store, store_url, contact, comms));
                }
            }

            if (!rows.isEmpty()) {
                MysqlUtils.InsertAnjukeAgentByNum(rows);
            }
        }

        /** Inserts a row that carries only the URL, marking the id as visited without agent data. */
        private static void insertPlaceholder(String url) {
            List<AgentListByNumModel> rows = new ArrayList<AgentListByNumModel>();
            rows.add(new AgentListByNumModel("", "", "", "", "anjuke", GetDate.getDay0(),
                    url, "", url, "", "", "", "", "", ""));
            MysqlUtils.InsertAnjukeAgentByNum(rows);
        }

        /** Increments the shared verification-page counter under the class lock and returns the new value. */
        private static synchronized int countBlocked() {
            num = num + 1;
            return num;
        }

        /** Returns {@code value}, or the empty string when it is {@code null}. */
        private static String nullToEmpty(String value) {
            return value == null ? "" : value;
        }

        /**
         * Returns the first match of {@code Constants.reg_phone} in {@code text},
         * or the empty string when {@code text} is {@code null} or nothing matches.
         */
        private static String extractPhone(String text) {
            if (text == null) {
                return "";
            }
            Matcher matcher = Pattern.compile(Constants.reg_phone).matcher(text);
            return matcher.find() ? matcher.group(0) : "";
        }

        /**
         * Extracts the text between "[" and the first space inside the bracket pair,
         * e.g. {@code "[zone-street road ...]"} yields {@code "zone-street"}.
         *
         * @return the extracted text, or {@code null} when the title is {@code null}
         *         or does not have that shape
         */
        private static String areaFromAddressTitle(String title) {
            if (title == null) {
                return null;
            }
            int open = title.indexOf('[');
            if (open < 0) {
                return null;
            }
            int close = title.indexOf(']', open + 1);
            if (close < 0) {
                return null;
            }
            String inner = title.substring(open + 1, close);
            int space = inner.indexOf(' ');
            if (space < 0) {
                return null;
            }
            return inner.substring(0, space);
        }

        /**
         * Splits {@code "zone-street"} into its two parts.
         *
         * @return a two-element array {zone, street}, with an empty string for a
         *         missing part, or {@code null} when {@code area} is {@code null}
         *         or contains no "-"
         */
        private static String[] splitArea(String area) {
            if (area == null || !area.contains("-")) {
                return null;
            }
            // split() drops trailing empty strings, so "zone-" yields a single element.
            String[] parts = area.split("-");
            String zone = parts.length > 0 ? parts[0] : "";
            String street = parts.length > 1 ? parts[1] : "";
            return new String[] { zone, street };
        }

        /**
         * Queues every agent id in [0, 100000) whose id is not already a key of
         * {@code Constants.map_id}, then runs the spider with 25 threads.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // presumably fills Constants.map_id with ids that were already crawled — confirm in MysqlUtils
            MysqlUtils.SelectSpiderID();
            PriorityScheduler scheduler = new PriorityScheduler();
            Spider spider = Spider.create(new GetAnjukeAgentByNum()).setScheduler(scheduler).addPipeline(new ConsolePipeline());
            for (int n = 0; n < MAX_AGENT_ID; n++) {
                if (Constants.map_id.containsKey(n + "")) {
                    System.out.println("contain : " + n);
                    continue;
                }
                String url = URL_PREFIX + n + "/";
                scheduler.push(new Request(url), spider);
            }
            System.out.println("total task num :" + scheduler.getTotalRequestsCount(spider));
            spider.thread(25).run();
            // Single-page debugging example:
            // Spider.create(new GetAnjukeAgentByNum()).addUrl("https://junranfangchan.anjuke.com/gongsi-jjr-99988/")
            //         .addPipeline(new ConsolePipeline()).thread(1).run();
        }
    }
  • 相关阅读:
    如何使用ps技术批量操作图片???
    [2020.8.3]联想 K10(K10e70) Magisk ROOT 纯净无推广 一键刷机 K10e70_S206_170105
    [2020.8.3]联想 K6畅享版(L38082) Magisk ROOT 纯净无推广 一键刷机 ZUI_3.9.226
    [2020.8.3]联想 K5S(L38031) Magisk ROOT 纯净无推广 一键刷机 ZUI_3.9.188
    [2020.8.3]联想 K5(K350t) Magisk ROOT 纯净无推广 一键刷机 ZUI_3.1.244
    [2020.8.3]联想 K5 Pro(L38041) Magisk ROOT 纯净无推广 一键刷机 ZUI_11.1.099
    [2020.8.3]联想 K5 Pro(L38041) Magisk ROOT 纯净无推广 一键刷机 ZUI_5.0.188
    [2020.8.3]联想 K5 Play(L38011) Magisk ROOT 纯净无推广 一键刷机 ZUI_3.7.087
    [2020.8.3]联想 K5 Note(L38012) Magisk ROOT 纯净无推广 一键刷机 ZUI_3.9.269
    --- 测试
  • 原文地址:https://www.cnblogs.com/tnsay/p/10895366.html
Copyright © 2011-2022 走看看