zoukankan      html  css  js  c++  java
  • webmagic学习之路-1:采集安居客列表页测试

    ---恢复内容开始---

    package com.action;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.management.JMException;
    
    import org.bson.Document;
    
    import com.model.Model_AnjukeList;
    import com.mongodb.BasicDBObject;
    import com.util.Constants;
    import com.util.GetDate;
    import com.util.MD5With32;
    import com.util.MongoDBUtil;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.monitor.SpiderMonitor;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    /**
     * WebMagic {@link PageProcessor} that scans each downloaded page for the
     * pattern {@code Constants.anjuke_Reg_Found} and, for pages whose URL hash is
     * present in {@code Constants.map_urls}, queues one MongoDB document per match
     * in {@link #list_insert}. {@link #main(String[])} runs the crawl and then
     * hands the queued documents to {@code MongoDBUtil.saveMany}.
     */
    public class GetAnjukeListNum implements PageProcessor {

        public static Model_AnjukeList anjukeList;
        public static List<String> list = new ArrayList<String>();
        // process() appends to this list and main() starts the spider with 30
        // threads, so appends may happen concurrently; a bare ArrayList is not
        // safe for that. The declared type stays List<BasicDBObject>.
        public static List<BasicDBObject> list_insert =
                java.util.Collections.synchronizedList(new ArrayList<BasicDBObject>());
        private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

        /**
         * Returns the crawl settings configured for this processor.
         *
         * @return the {@link Site} instance held by this processor
         */
        @Override
        public Site getSite() {
            return this.site;
        }

        /**
         * Handles one downloaded page: enqueues {@code Constants.list_urls} as
         * follow-up requests, prints the HTTP status code, and for every regex
         * match in the page text appends a document to {@link #list_insert}.
         *
         * <p>Pages whose text is 100 characters or shorter, pages with no match,
         * and pages whose URL hash is not a key of {@code Constants.map_urls}
         * produce no documents.
         *
         * @param page the page delivered by the WebMagic downloader
         */
        @Override
        public void process(Page page) {
            page.addTargetRequests(Constants.list_urls);
            System.out.println("code:"+page.getStatusCode());
            String pg = page.getHtml().toString();
            if (pg.length() <= 100) {
                return;
            }

            Matcher m = Pattern.compile(Constants.anjuke_Reg_Found).matcher(pg);
            if (!m.find()) {
                return;
            }

            // The id depends only on the page URL, so it is computed once per page
            // instead of once per match.
            String id = MD5With32.encryption(page.getUrl().toString());
            if (!Constants.map_urls.containsKey(id)) {
                return;
            }
            Model_AnjukeList model_AnjukeList = Constants.map_urls.get(id);

            do {
                // Strip the JSON key prefix and any commas from the matched text.
                String found = m.group(0).replace("\"found\":", "").replace(",", "");

                // Document to be stored in MongoDB.
                BasicDBObject doc = new BasicDBObject("_id", id)
                        .append("city", model_AnjukeList.getCity())
                        .append("towards", model_AnjukeList.getTowards())
                        .append("zone_urls", model_AnjukeList.getZone_urls())
                        .append("zone", model_AnjukeList.getZone())
                        .append("site", model_AnjukeList.getSite())
                        .append("decoration", model_AnjukeList.getDecoration())
                        .append("flag", model_AnjukeList.getFlag())
                        .append("street", model_AnjukeList.getStreet())
                        .append("type", model_AnjukeList.getType())
                        .append("page", model_AnjukeList.getPage())
                        .append("urls", model_AnjukeList.getUrls())
                        .append("found", found)
                        .append("update_time", model_AnjukeList.getUpdate_time());
                list_insert.add(doc);
            } while (m.find());
        }

        /**
         * Entry point: loads the URL list for the configured city, crawls starting
         * from the seed URL with 30 threads, then saves the collected documents.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String city = "北京";
            String urls = "https://beijing.anjuke.com/sale/";
            MongoGetUrls.GetMongoUrls(city);
            System.out.println("任务总数:"+Constants.list_urls.size());
            Spider.create(new GetAnjukeListNum())
                    .addUrl(urls)
                    .addPipeline(new ConsolePipeline())
                    .thread(30)
                    .run();
            // NOTE(review): the first argument was elided ("...") in the published
            // source and must be supplied before this compiles — confirm against
            // MongoDBUtil.saveMany's signature.
            MongoDBUtil.saveMany(..., list_insert);
        }
    }

    第一次用 webmagic,很多东西不懂,也没有重写。

    很多都是用纯java实现

    让我们慢慢发现webmagic的强大吧!

  • 相关阅读:
    Windows JScript 在 游览器 中运行 调试 Shell 文件系统
    autohotkey 符号链接 软连接 symbolink
    软链接 硬链接 测试
    SolidWorks 修改 基准面 标准坐标系
    手机 路径 WebDAV 映射 驱动器
    Win10上手机路径
    explorer 命令行
    单位公司 网络 封锁 屏蔽 深信 AC
    cobbler自动化部署原理篇
    Docker四种网络模式
  • 原文地址:https://www.cnblogs.com/tnsay/p/10895284.html
Copyright © 2011-2022 走看看