zoukankan      html  css  js  c++  java
  • java 爬取 国家统计局 省市区级联关系

    爬取网址 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html

    因为数据比较大,存储为一个json,会内存溢出。

    所以按照每个省市进行存储。

    同时因为需要远程访问链接获取数据,所以会将已经拿到的网页缓存到本地,以便下次直接使用。

    package com.witwicky.jsoup;
    
    import com.google.gson.Gson;
    import com.google.gson.GsonBuilder;
    import com.witwicky.vo.CrawlingVo;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.*;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Random;
    
    public class Crawling {
        private static final String BASE_SAVE_DIR = "E:\工作\extract";
        private static final String RESULT_SAVE_DIR = "E:\工作\extract_result";
    
        public static void main(String[] args) throws Exception {
            Gson gsonPretty = new GsonBuilder().setPrettyPrinting().create();
            Gson gsonSimple = new GsonBuilder().create();
            List<CrawlingVo> crawlingVos = new ArrayList<CrawlingVo>();
            Elements select = getElements("index.html", "tr.provincetr > td > a");
            for (Element element : select) {
                List<CrawlingVo> crawlingVos1 = new ArrayList<CrawlingVo>();
                String val = element.attr("href");
                crawlingVos.add(new CrawlingVo(val.substring(0, val.indexOf(".")), element.text(), crawlingVos1));
    
                String baseUrl = element.attr("href");
                String baseUrlPre = baseUrl.substring(0, baseUrl.indexOf("."));
                Elements ele = getElements(baseUrl, "tr.citytr");
                for (Element nextE : ele) {
                    List<CrawlingVo> crawlingVos2 = new ArrayList<CrawlingVo>();
                    crawlingVos1.add(new CrawlingVo(nextE.select("td:eq(0) a").text(), nextE.select("td:eq(1) a").text(), crawlingVos2));
    
                    String href = nextE.select("td:eq(1) a").attr("href");
                    String substring = href.substring(0, baseUrl.indexOf("."));
                    Elements contryElements = getElements(href, "tr.countytr");
                    for (Element contryElement : contryElements) {
                        List<CrawlingVo> crawlingVos3 = new ArrayList<CrawlingVo>();
                        crawlingVos2.add(new CrawlingVo(contryElement.select("td:eq(0) a").text(), contryElement.select("td:eq(1) a").text(), crawlingVos3));
    
                        String href1 = contryElement.select("td:eq(1) a").attr("href");
    
                        if (!"".equalsIgnoreCase(href1)) {
                            String substring1 = href1.substring(0, baseUrl.indexOf("."));
                            Elements elements = getElements(substring + "/" + href1, "tr.towntr");
                            for (Element element1 : elements) {
                                List<CrawlingVo> crawlingVos4 = new ArrayList<CrawlingVo>();
                                crawlingVos3.add(new CrawlingVo(element1.select("td:eq(0) a").text(), element1.select("td:eq(1) a").text(), crawlingVos4));
    
                                String href2 = element1.select("td:eq(1) a").attr("href");
                                Elements elements1 = getElements(baseUrlPre + "/" + substring1 + "/" + href2, "tr.villagetr");
                                for (Element element2 : elements1) {
                                    crawlingVos4.add(new CrawlingVo(element2.select("td:eq(0)").text(), element2.select("td:eq(2)").text(), new ArrayList<CrawlingVo>()));
                                }
                            }
                        }
                    }
                }
                save2File(gsonSimple.toJson(crawlingVos), element.text() + ".json", RESULT_SAVE_DIR);
                save2File(gsonPretty.toJson(crawlingVos), element.text() + "_pretty.json", RESULT_SAVE_DIR);
    
                System.out.println(element.text() + " is complete!");
            }
        }
    
        private static Elements getElements(String u, String selector) throws IOException, InterruptedException {
            String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/" + u;
            String cleanUrl = cleanName(url);
            Document select = null;
            File localFile = new File(BASE_SAVE_DIR, cleanUrl);
            if (localFile.exists()) {
                select = Jsoup.parse(localFile, "UTF-8");
            }
            boolean remoteUrl = false;
            if (select == null) {
                int intRd = new Random().nextInt(5) + 1;
                Thread.sleep(intRd * 1000);
                select = Jsoup.connect(url).get();
                remoteUrl = true;
            }
            if (remoteUrl) {
                save2File(select.toString(), cleanName(url), BASE_SAVE_DIR);
            }
            return select.select(selector);
        }
    
        private static String cleanName(String name) {
            return name
                    .replace("\", "_")
                    .replace("/", "_")
                    .replace("//", "_")
                    .replace(".", "_")
                    .replace(":", "_");
        }
    
        private static void save2File(String content, String fileName, String saveDir) {
            File dir = new File(saveDir);
            if (!dir.exists()) {
                boolean mkdirs = dir.mkdirs();
                if (!mkdirs) {
                    return;
                }
            }
    
            File file = new File(dir, fileName);
            if (file.exists()) {
               return;
            }
    
            try {
                FileOutputStream outSTr = new FileOutputStream(file);
                BufferedOutputStream Buff = new BufferedOutputStream(outSTr);
                Buff.write(content.getBytes());
                Buff.flush();
                Buff.close();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    
    package com.witwicky.vo;
    
    import java.util.List;
    
    /**
     * One node of the crawled region tree: a code, a display name and the nodes below it.
     *
     * <p>The field names are the keys of the serialised JSON.
     */
    public class CrawlingVo {
        /** Code of this node. */
        private String value;
        /** Display name of this node. */
        private String label;
        /** Nodes directly below this one. */
        private List<CrawlingVo> children;

        /** Creates a node whose fields are all {@code null}. */
        public CrawlingVo() {
            this(null, null, null);
        }

        /**
         * Creates a fully populated node.
         *
         * @param value    code of the node
         * @param label    display name of the node
         * @param children nodes directly below this one; stored as given, not copied
         */
        public CrawlingVo(String value, String label, List<CrawlingVo> children) {
            setValue(value);
            setLabel(label);
            setChildren(children);
        }

        // ---- accessors ----

        /** @return the code of this node */
        public final String getValue() {
            return this.value;
        }

        /** @return the display name of this node */
        public final String getLabel() {
            return this.label;
        }

        /** @return the list of child nodes exactly as it was set */
        public final List<CrawlingVo> getChildren() {
            return this.children;
        }

        // ---- mutators ----

        /** @param value new code of this node */
        public final void setValue(String value) {
            this.value = value;
        }

        /** @param label new display name of this node */
        public final void setLabel(String label) {
            this.label = label;
        }

        /** @param children new list of child nodes; stored as given, not copied */
        public final void setChildren(List<CrawlingVo> children) {
            this.children = children;
        }
    }
    
    \审判系统
    [\Shěnpàn xìtǒng]
    \ trial system
  • 相关阅读:
    SpringBoot中并发定时任务的实现、动态定时任务的实现(看这一篇就够了)
    压力测试
    AlertManger的详细配置
    http://www.linuxe.cn/post-518.html
    https://helpcdn.aliyun.com/knowledge_detail/194196.html
    AIOPS智能监控团队
    普罗新修斯监控mysql数据库1
    干货 | Elasticsearch集群黄色原因的终极探秘
    elasticsearch集群在生产上面必看的优化文章
    干货丨DolphinDB与Elasticserach在金融数据集上的性能对比测试
  • 原文地址:https://www.cnblogs.com/hfultrastrong/p/10650614.html
Copyright © 2011-2022 走看看