zoukankan      html  css  js  c++  java
  • java.net.*爬取网页,Jsoup解析网页内容

    java.net.* 建立网络连接

    Jsoup解析网页内容

    package com.sun.util;
    
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLConnection;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class DataDownUtil {
        /**
         * @author UPO
         * @param url
         * @param encoding
         * @return String 网页的源代码
         * <a href="http://www.baidu.com">百度</a>
         * <a href="https://movie.douban.com/subject/3168101/comments?start=0&limit=20&sort=new_score&status=P">爬取的网页</a>
         *                 
         */
        public static String getHtmlResourceByUrl(String url,String encoding){
            StringBuffer buffer=new StringBuffer();
            URL urlobj=null;
            URLConnection uc=null;
            InputStreamReader isr=null;
            BufferedReader reader=null;
            try {
                //建立网络连接
                urlobj=new URL(url);
                //打开网络
                uc=urlobj.openConnection();
                //建立文件输入流的对象
                isr=new InputStreamReader(uc.getInputStream(), encoding);
                //建立文件缓冲写入流(相当于ctrl+v放入内存中)
                reader=new BufferedReader(isr);
                
                //建立临时变量
                String temp=null;
                while((temp=reader.readLine())!=null){
                    buffer.append(temp);
                    //buffer.append("
    ");
                }
                
            } catch (MalformedURLException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                System.out.println("网络连接不可用");
            }catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                System.out.println("网络连接失败");
            }finally {
                if(isr!=null){
                    try {
                        isr.close();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
            return buffer.toString();
        }
        public static String getContext(){
            
            StringBuffer context=new StringBuffer();
            int start=0;
            while(start>=0&&start<=60){
                
                //查看网页url地址栏
                String url="https://movie.douban.com/subject/26266893/comments?start="+start+"&limit=20&sort=new_score&status=P";
                String encoding="utf-8";
                //观察可知每一页加载20个评价item
                start=start+20;
                //1.获取网页源代码
                String html=getHtmlResourceByUrl(url, encoding);
                //System.out.println(html);
                //2.解析
                Document document=Jsoup.parse(html);
                //3.最外层的id是:comments
                Element element=document.getElementById("comments");
                //4.里面的每一个item的id是:comment-item
                Elements elements=element.getElementsByClass("comment-item");
                for (Element ele : elements) {
                    //https://movie.douban.com/subject/3168101/comments?start=20&limit=20&sort=new_score&status=P
                    String name=ele.getElementsByTag("a").last().text();
                    String desc=ele.getElementsByClass("short").text();
                    String time=ele.getElementsByClass("comment-time").text();
                    String votes=ele.getElementsByClass("votes").text();
                    //System.out.println("
    name:"+name+"
    desc:"+desc+"
    time:"+time+"
    votes:"+votes);
                    context.append("
    ");
                    context.append("name:"+name+"
    desc:"+desc+"
    time:"+time+"
    votes:"+votes);
                    context.append("
    ");
                }
            }
            System.out.println(context);
            return context.toString();
        }
    
        /**
         * 将文件一行行写入到文件中
         * @author 孙敬钦
         * @version 1.0
         * @param content 解析到的文件内容
         * @param filePath 存储的文件名字
         * @return void
         */
        public static void writeFileByLine(String context,String filePath){
            File file=new File(filePath);
            PrintWriter printWriter=null;;
            try {
                printWriter=new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
                printWriter.print(context);
                printWriter.flush();
            } catch (FileNotFoundException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }finally {
                //关闭printWriter
                if(printWriter!=null){
                    printWriter.close();
    
                }
    
            }
        }
        
        public static void main(String[] args) {
            System.out.println("你好阿泡");
            //1.得到解析的网页数据
            String context=getContext();
            
            System.out.println(context);
            //2.保存到txt文件
            String filePath="D:/movie/bigdata.txt";
            
            writeFileByLine(context, filePath);
            //3.保存到hdfs文件系统
            
            
        }
    
    }
  • 相关阅读:
    本地项目上传到github
    linux 常用命令
    mysql 查询日志基本操作
    js 短信60秒倒计时
    windows下 mysql 移库
    INSERT INTO table(xxx) VALUES (xxx)
    springboot 项目接口调用失败
    P1093 奖学金
    P1403约数研究
    P1147连续自然数和
  • 原文地址:https://www.cnblogs.com/sunupo/p/13409602.html
Copyright © 2011-2022 走看看