  • Java Learning -- Web Crawler

    Contents

       1. The jar package: jsoup

       2. Parsing steps (crawling 51job pages)

    1. The jar package: jsoup

      jsoup is an open-source HTML parsing toolkit.

      jsoup download link: http://www.mvnjar.com/org.jsoup/jsoup/1.11.3/detail.html
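
      For orientation, the core jsoup calls used throughout this post are parse()/connect() to obtain a Document and select() to query it. A minimal sketch (the HTML fragment here is made up for illustration):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class JsoupDemo {
        public static void main(String[] args) {
            //a made-up HTML fragment, just to show parse() and select()
            String html = "<div class=\"t1\"><span><a href=\"#\">Java Developer</a></span></div>";
            Document doc = Jsoup.parse(html);
            //select() takes a CSS-style selector and returns the matching elements
            String title = doc.select(".t1 span a").text();
            System.out.println(title); //prints: Java Developer
        }
    }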

    2. Parsing steps (crawling 51job pages)

      1. Open the page to be crawled.

      2. Obtain a Document object from the connection.

      3. Parse the Document with jsoup's select().

      4. Create an entity class with fields for the information to be crawled.

      5. Put the values found by select() into entity objects and save those objects into a collection.

    Note: jsoup's select() queries use CSS-style selectors, so they must follow the structure of the target page's HTML. The three classes below implement these steps.

      

    package com.work.crawler;
    /**
     * Job information entity
     * @author Hu YS
     *
     * September 1, 2018
     */
    public class Work implements Comparable<Work>{
        private String position;//job title
        private String company;//company
        private String place;//work location
        private String salary;//salary
        private String date;//posting date (format "MM-dd")
        public String getPosition() {
            return position;
        }
        public void setPosition(String position) {
            this.position = position;
        }
        public String getCompany() {
            return company;
        }
        public void setCompany(String company) {
            this.company = company;
        }
        public String getPlace() {
            return place;
        }
        public void setPlace(String place) {
            this.place = place;
        }
        public String getSalary() {
            return salary;
        }
        public void setSalary(String salary) {
            this.salary = salary;
        }
        public String getDate() {
            return date;
        }
        public void setDate(String date) {
            this.date = date;
        }
        @Override
        public String toString() {
            return "Work [position=" + position + ", company=" + company + ", place=" + place + ", salary=" + salary
                    + ", date=" + date + "]";
        }
        /**
         * Sort order: newest posting date first.
         * The 51job date column has the form "MM-dd", e.g. "09-01".
         */
        @Override
        public int compareTo(Work o) {
            int thisMonth = Integer.parseInt(this.getDate().substring(0, 2));
            int thisDay = Integer.parseInt(this.getDate().substring(3, 5));
            int otherMonth = Integer.parseInt(o.getDate().substring(0, 2));
            int otherDay = Integer.parseInt(o.getDate().substring(3, 5));

            //compare months first, then days; later (more recent) dates sort first
            if (thisMonth != otherMonth) {
                return otherMonth - thisMonth;
            }
            return otherDay - thisDay;
        }
    }
    The job information entity class
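
      Because Work implements Comparable, a crawled list can be sorted by posting date. The main program below never actually sorts, so this is just a usage sketch:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    public class SortDemo {
        public static void main(String[] args) {
            Work a = new Work();
            a.setDate("09-01");
            Work b = new Work();
            b.setDate("09-15");

            List<Work> list = new ArrayList<>();
            list.add(a);
            list.add(b);

            //uses Work.compareTo, so the newest posting comes first
            Collections.sort(list);
            System.out.println(list.get(0).getDate()); //prints: 09-15
        }
    }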
    package com.work.crawler;
    
    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import net.sf.json.JSONArray;
    /**
     * Page-crawling thread
     * @author Administrator
     *
     */
    public class Crawler implements Runnable {
        //URL of the page to crawl
        private String url;
        //shared collection that stores the results
        private List<Work> list;
        public Crawler(String url,List<Work> list) {
            this.list=list;
            this.url=url;
        }
        @Override
        public void run() {
            try {
                //connect to the page and fetch its Document
                Document doc = Jsoup.connect(url).timeout(5000).get();
                //select the listing rows; :gt(2) skips the leading header rows of the results table
                Elements eles = doc.select(".dw_table .el:gt(2)");
                for (Element element : eles) {
                    Work work = new Work();
                    String position = element.select(".t1 span a").text();
                    String company = element.select(".t2 a").text();
                    String place = element.select(".t3").text();
                    String salary = element.select(".t4").text();
                    String date = element.select(".t5").text();
                    work.setCompany(company);
                    work.setDate(date);
                    work.setPlace(place);
                    work.setSalary(salary);
                    work.setPosition(position);
                    System.out.println(work);
                    list.add(work);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        /**
         * Converts the crawled results to JSON and appends them to a local file
         * @param list    the list to convert
         */
        public static void save(List<Work> list) {
            BufferedWriter bw = null;
            try {
                //open the JSON file in append mode (note the escaped backslashes in the path)
                bw = new BufferedWriter(new FileWriter("E:\\用户\\Desktop\\目标\\1.json", true));
                //convert the whole list into a JSON array (net.sf.json / json-lib)
                JSONArray fromObject = JSONArray.fromObject(list);
                bw.write(fromObject.toString());
                bw.newLine();
                bw.flush();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (bw != null) {
                    try {
                        bw.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
    The crawler thread
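
      One caveat: every Crawler thread adds to the same list, and a plain ArrayList is not thread-safe, so concurrent writes can lose entries. A minimal fix (my suggestion, not part of the original code) is to wrap the shared list in Main:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    public class Main {
        //a synchronized wrapper makes concurrent add() calls from the
        //crawler threads safe; everything else stays unchanged
        static List<Work> list = Collections.synchronizedList(new ArrayList<>());
        //... rest of Main as below ...
    }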
    package com.work.crawler;

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    /**
     * Crawls the pages using a thread pool
     * @author Administrator
     *
     */
    public class Main {
        static List<Work> list = new ArrayList<>();
        public static void main(String[] args) {
            long s1 = System.currentTimeMillis();
            int count = 1;
            //create the thread pool
            ExecutorService es = Executors.newCachedThreadPool();
            while(true) {
                //stop after 150 pages have been submitted
                if(count == 150) {
                    break;
                }
                //URL for this crawl task (the count appended before ".html" is the page number)
                String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,Java%2B%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,"+count+".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
                count+=1;
                //submit a crawl task to the pool (url = page address, list = shared result list)
                es.execute(new Crawler(url, list));
            }
            //shut down the pool; already-submitted tasks still run to completion
            es.shutdown();
            //poll until the pool has terminated
            while(true) {
                //once every task has finished, save the results locally
                if(es.isTerminated()) {
                    Crawler.save(list);
                    System.out.println("over");
                    break;
                }
            }
            long s2 = System.currentTimeMillis();
            System.out.println(s2-s1);
        }
    }
    Crawling the pages with a thread pool
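
      The polling loop above spins a CPU core while waiting for isTerminated(). A quieter alternative, sketched here with an assumed 10-minute timeout, is awaitTermination, which blocks until the pool finishes:

    import java.util.concurrent.TimeUnit;

    //drop-in replacement for the while(true)/isTerminated() loop;
    //awaitTermination throws InterruptedException, so main must declare or catch it
    es.shutdown();
    if (es.awaitTermination(10, TimeUnit.MINUTES)) {
        Crawler.save(list);
        System.out.println("over");
    }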