zoukankan      html  css  js  c++  java
  • webmagic学习之路-1:采集安居客列表页测试

    ---恢复内容开始---

    package com.action;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.management.JMException;
    
    import org.bson.Document;
    
    import com.model.Model_AnjukeList;
    import com.mongodb.BasicDBObject;
    import com.util.Constants;
    import com.util.GetDate;
    import com.util.MD5With32;
    import com.util.MongoDBUtil;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.monitor.SpiderMonitor;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    /**
     * WebMagic {@link PageProcessor} that scans each downloaded Anjuke list page
     * for the "found" value matched by {@code Constants.anjuke_Reg_Found} and
     * queues one MongoDB document per match in {@link #list_insert}.
     *
     * <p>The documents are only collected while crawling; {@link #main(String[])}
     * writes them out in a single batch after the spider has finished.
     */
    public class GetAnjukeListNum implements PageProcessor {

        public static Model_AnjukeList anjukeList;
        public static List<String> list = new ArrayList<String>();
        /**
         * Documents collected by {@link #process(Page)}. This is a plain
         * {@code ArrayList}, so every mutation must hold the list's own monitor:
         * {@code main} runs the spider with 30 threads.
         */
        public static List<BasicDBObject> list_insert = new ArrayList<BasicDBObject>();

        private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

        /**
         * Returns the crawl settings (sleep time, retry count, charset and
         * user agent) configured for this processor.
         */
        @Override
        public Site getSite() {
            return this.site;
        }

        /**
         * Extracts every "found" match from the page and queues a document for it.
         *
         * <p>Pages whose HTML is 100 characters or shorter are skipped, as are
         * pages whose URL hash is not a key of {@code Constants.map_urls}.
         *
         * @param page the downloaded page handed over by the spider
         */
        @Override
        public void process(Page page) {
            page.addTargetRequests(Constants.list_urls);
            System.out.println("code:"+page.getStatusCode());

            String html = page.getHtml().toString();
            if (html.length() <= 100) {
                return;
            }

            Matcher matcher = Pattern.compile(Constants.anjuke_Reg_Found).matcher(html);
            if (!matcher.find()) {
                return;
            }

            // The id depends only on the page URL, so resolve it once instead of
            // once per regex match.
            String id = MD5With32.encryption(page.getUrl().toString());
            if (!Constants.map_urls.containsKey(id)) {
                return;
            }
            Model_AnjukeList model = Constants.map_urls.get(id);

            do {
                // Strip the JSON key and the trailing comma, keeping only the value.
                String found = matcher.group(0).replace("\"found\":", "").replace(",", "");

                // Build the MongoDB document for this match.
                BasicDBObject doc = new BasicDBObject("_id", id)
                        .append("city", model.getCity())
                        .append("towards", model.getTowards())
                        .append("zone_urls", model.getZone_urls())
                        .append("zone", model.getZone())
                        .append("site", model.getSite())
                        .append("decoration", model.getDecoration())
                        .append("flag", model.getFlag())
                        .append("street", model.getStreet())
                        .append("type", model.getType())
                        .append("page", model.getPage())
                        .append("urls", model.getUrls())
                        .append("found", found)
                        .append("update_time", model.getUpdate_time());

                // process() is invoked from several spider threads and ArrayList
                // is not thread-safe, so serialize the append.
                synchronized (list_insert) {
                    list_insert.add(doc);
                }
            } while (matcher.find());
        }

        /**
         * Loads the task URLs for Beijing, crawls them with 30 threads and then
         * stores all collected documents in one batch.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String city = "北京";
            String urls = "https://beijing.anjuke.com/sale/";
            MongoGetUrls.GetMongoUrls(city);
            System.out.println("任务总数:"+Constants.list_urls.size());
            Spider.create(new GetAnjukeListNum())
                    .addUrl(urls)
                    .addPipeline(new ConsolePipeline())
                    .thread(30)
                    .run();
            // NOTE(review): the leading argument(s) were elided as "..." in the
            // published listing and cannot be recovered from it; fill them in to
            // match the MongoDBUtil.saveMany signature before compiling.
            MongoDBUtil.saveMany(..., list_insert);
        }
    }

    第一次用 webmagic,很多东西还不懂,也没有做任何重写。

    很多功能都是用纯 Java 实现的。

    让我们慢慢发现webmagic的强大吧!

  • 相关阅读:
    ssh 免密码登陆设置不成功
    mysql: SOURCE error 2?
    Debug --> 服务器上运行代码的not find module错误
    Debug --> 使用服务器的一些日常记录
    Debug --> 使用pycharm(pro)部署项目至服务器
    Debug --> python 将输出至控制台的信息存入指定txt文件
    Debug --> matlibplot的字体设置方法
    Machine Learning --> MSE&RMSE&MAE
    Debug --> 奇奇怪怪的显卡调用错误
    Debug --> Variable,Tensor,Numpy的转换
  • 原文地址:https://www.cnblogs.com/tnsay/p/10895284.html
Copyright © 2011-2022 走看看