zoukankan      html  css  js  c++  java
  • webmagic学习之路-1:采集安居客列表页测试

    ---恢复内容开始---

    package com.action;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.management.JMException;
    
    import org.bson.Document;
    
    import com.model.Model_AnjukeList;
    import com.mongodb.BasicDBObject;
    import com.util.Constants;
    import com.util.GetDate;
    import com.util.MD5With32;
    import com.util.MongoDBUtil;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.monitor.SpiderMonitor;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    /**
     * WebMagic {@link PageProcessor} for Anjuke listing pages.
     *
     * <p>For every downloaded page whose URL hash is registered in {@code Constants.map_urls},
     * the processor extracts each match of {@code Constants.anjuke_Reg_Found} from the HTML and
     * queues a MongoDB document (registered listing metadata plus the extracted "found" value)
     * in {@link #list_insert}. {@link #main(String[])} hands the queued documents to
     * {@code MongoDBUtil.saveMany} after the crawl.
     */
    public class GetAnjukeListNum implements PageProcessor {

        public static Model_AnjukeList anjukeList;
        public static List<String> list = new ArrayList<String>();

        /**
         * Documents collected by {@link #process(Page)}. The spider is started with 30 worker
         * threads, so the list is wrapped in a synchronized view: a bare ArrayList is not safe
         * for concurrent {@code add} calls.
         */
        public static List<BasicDBObject> list_insert =
                java.util.Collections.synchronizedList(new ArrayList<BasicDBObject>());

        /** Crawl settings: 1000 ms sleep time, 3 retries, UTF-8 charset, desktop Chrome user agent. */
        private final Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

        /** Returns the crawl settings used for every request of this processor. */
        @Override
        public Site getSite() {
            return this.site;
        }

        /**
         * Queues the known task URLs for crawling and, when the page is a registered task,
         * records one document per regex match in {@link #list_insert}.
         *
         * @param page the downloaded page handed over by WebMagic
         */
        @Override
        public void process(Page page) {
            // NOTE(review): the full task list is re-submitted on every page; this presumably
            // relies on the scheduler dropping duplicate URLs - confirm.
            page.addTargetRequests(Constants.list_urls);
            System.out.println("code:" + page.getStatusCode());

            String html = page.getHtml().toString();
            if (html.length() <= 100) {
                // Pages this short are skipped, as in the original length check.
                return;
            }

            // The task id is the hash of the page URL; it is the same for every match on the
            // page, so it is resolved once instead of once per match.
            String id = MD5With32.encryption(page.getUrl().toString());
            Model_AnjukeList task = Constants.map_urls.get(id);
            if (task == null) {
                // Not a registered task URL: nothing to store.
                return;
            }

            Matcher matcher = Pattern.compile(Constants.anjuke_Reg_Found).matcher(html);
            while (matcher.find()) {
                // Strip the JSON key and the trailing comma from the matched text.
                String found = matcher.group(0).replace("\"found\":", "").replace(",", "");

                // Build the MongoDB document for this task.
                BasicDBObject doc = new BasicDBObject("_id", id)
                        .append("city", task.getCity())
                        .append("towards", task.getTowards())
                        .append("zone_urls", task.getZone_urls())
                        .append("zone", task.getZone())
                        .append("site", task.getSite())
                        .append("decoration", task.getDecoration())
                        .append("flag", task.getFlag())
                        .append("street", task.getStreet())
                        .append("type", task.getType())
                        .append("page", task.getPage())
                        .append("urls", task.getUrls())
                        .append("found", found)
                        .append("update_time", task.getUpdate_time());
                list_insert.add(doc);
            }
        }

        /**
         * Loads the task URLs for the configured city, runs the spider with 30 threads starting
         * from the city's sale listing page, then passes the collected documents to
         * {@code MongoDBUtil.saveMany}.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String city = "北京";
            String urls = "https://beijing.anjuke.com/sale/";

            MongoGetUrls.GetMongoUrls(city);
            System.out.println("任务总数:" + Constants.list_urls.size());

            Spider.create(new GetAnjukeListNum())
                    .addUrl(urls)
                    .addPipeline(new ConsolePipeline())
                    .thread(30)
                    .run();

            // TODO(review): the first argument is elided ("...") in the published source and
            // cannot be reconstructed from it; supply the real value before compiling.
            MongoDBUtil.saveMany(..., list_insert);
        }
    }

    第一次使用 WebMagic,很多东西还不懂,也没有重写。

    很多功能都是用纯 Java 实现的。

    让我们慢慢发现webmagic的强大吧!

  • 相关阅读:
    神经网络层数问题
    matlab入门笔记(六):编程基础之M文件
    工字电感,色环电感,功率电感选型区别
    x电容和Y电容
    nohup.out文件过大解决方法 定时任务清空
    WebRTC的视频解码原理简析
    activemq 安装 部署
    WebRTC信令控制简介与STUN, TURN服务器搭建
    如何搭建WebRTC信令服务器
    ZooKeeper安装和配置
  • 原文地址:https://www.cnblogs.com/tnsay/p/10895284.html
Copyright © 2011-2022 走看看