框架:webmagic(java)
目标:随便找个爱爆照的小姐姐,将她所有发的照片整理出一个文件夹
代码写了个开头,数据处理和思路还要再理一下。
1 import us.codecraft.webmagic.Site; 2 import us.codecraft.webmagic.Spider; 3 import us.codecraft.webmagic.pipeline.JsonFilePipeline; 4 import us.codecraft.webmagic.processor.PageProcessor; 5 6 public class Main { 7 8 9 public static void main(String args[]) { 10 Spider.create(new zhihu()) 11 12 .addUrl("https://www.zhihu.com/people/JudyZhao/answers") 13 .addPipeline(new JsonFilePipeline("C:\judy\")) 14 //开启5个线程抓取 15 .thread(5) 16 //启动爬虫 17 .run(); 18 } 19 20 } 21 class zhihu implements PageProcessor{ 22 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); 23 @Override 24 public void process(Page page) { 25 page.putField("photo", page.getHtml().xpath("//div[@class=RichContent-inner]//img/@src").toString()); 26 if (page.getResultItems().get("photo") == null) { 27 //skip this page 28 page.setSkip(true); 29 } 30 page.addTargetRequests(page.getHtml().links().regex("https://www\.zhihu\.com/question/\d+/answer/\d+").all()); 31 32 } 33 34 35 public Site getSite() { 36 return site; 37 } 38 }
效果图如上,真的很好看:)