zoukankan      html  css  js  c++  java
  • Jsoup应用对比测试

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;
    import org.jsoup.select.Selector.SelectorParseException;
    
    /**
     * Compares SEO-relevant page data between the production and the staging
     * deployment of internetcorkboard.com.
     *
     * <p>For every path listed in {@code C:/ICBTest/CategoryUrl.txt} the page is
     * fetched from both hosts with jsoup, a fixed set of fields is extracted, and
     * every field whose value differs is appended to
     * {@code C:/ICBTest/CategoryError.txt}.
     */
    public class ICBCategoryTest {
        /**
         * Report labels, index-aligned with the array returned by
         * {@link #ICB(String)}. The strings are written verbatim to the report
         * file, so their spelling is kept exactly as in earlier reports.
         */
        private static final String[] FIELD_NAMES = {
            "title", "descrption", "canonical", "og:site_name", "og:image:width",
            "og:image:height", "og:title", "og:description", "og:url", "og:type",
            "body", "related articles", "you may also like"
        };

        /** Host prefix used in absolute URLs emitted by the production site. */
        private static final String PRODUCTION_HOST = "www.internetcorkboard.";

        /** Host prefix used in absolute URLs emitted by the staging site. */
        private static final String STAGING_HOST = "staging.internetcorkboard.";

        /**
         * Prints a line to standard output.
         *
         * @param s the text to print
         */
        public void print(String s) {
            System.out.println(s);
        }

        /**
         * Fetches a page and extracts the fields that are compared between hosts.
         *
         * <p>The returned array always has 13 non-null entries, in this order:
         * title, meta description, canonical link, og:site_name, og:image:width,
         * og:image:height, og:title, og:description, og:url, og:type, text of
         * elements with class {@code NoAdsBody}, text of elements with class
         * {@code relatedarticles}, text of elements with class {@code rgtitle}.
         * A field that is absent from the page is the empty string. The canonical
         * link and og:url have the production host rewritten to the staging host
         * so that the two deployments can be compared with plain equality.
         *
         * <p>HTTP error statuses, refused connections and read timeouts are logged
         * to standard output and yield an array of empty strings instead of an
         * exception. Note that two pages that both fail therefore compare as equal.
         *
         * @param URL absolute URL of the page to fetch
         * @return the 13 extracted field values (never {@code null})
         * @throws IOException on any other I/O failure while fetching the page
         */
        public String[] ICB(String URL) throws IOException {
            String[] result = new String[FIELD_NAMES.length];
            java.util.Arrays.fill(result, "");
            try {
                Document doc = Jsoup.connect(URL).timeout(120000).get();

                result[0] = doc.select("title").text();
                // Restrict to the description tag; a bare meta[name] would pick up
                // whichever named meta tag (viewport, keywords, ...) comes first.
                result[1] = selectAttr(doc, "meta[name=description]", "content");
                result[2] = toStagingHost(selectAttr(doc, "link[rel=canonical]", "href"));
                result[3] = selectAttr(doc, "meta[property=og:site_name]", "content");
                result[4] = selectAttr(doc, "meta[property=og:image:width]", "content");
                result[5] = selectAttr(doc, "meta[property=og:image:height]", "content");
                result[6] = selectAttr(doc, "meta[property=og:title]", "content");
                result[7] = selectAttr(doc, "meta[property=og:description]", "content");
                result[8] = toStagingHost(selectAttr(doc, "meta[property=og:url]", "content"));
                result[9] = selectAttr(doc, "meta[property=og:type]", "content");
                result[10] = classText(doc, "NoAdsBody");
                result[11] = classText(doc, "relatedarticles");
                result[12] = classText(doc, "rgtitle");
            } catch (java.lang.NullPointerException e) {
                // Best-effort guard carried over from the original implementation.
                System.out.println("null   "+URL);
            } catch (org.jsoup.HttpStatusException e) {
                System.out.println(e.getStatusCode()+"  "+URL);
            } catch (java.net.ConnectException e) {
                System.out.println("Time out :"+URL);
            } catch (java.net.SocketTimeoutException e) {
                // Expiry of the 120 s timeout above is reported as this type.
                System.out.println("Time out :"+URL);
            }
            return result;
        }

        /**
         * Returns the value of {@code attribute} on the first element matching
         * {@code cssQuery} that has it, or the empty string when there is none or
         * the query cannot be parsed.
         */
        private static String selectAttr(Document doc, String cssQuery, String attribute) {
            try {
                Elements matches = doc.select(cssQuery);
                return matches.attr(attribute);
            } catch (SelectorParseException e) {
                return "";
            }
        }

        /** Returns the combined text of all elements carrying the given CSS class. */
        private static String classText(Document doc, String className) {
            Elements matches = doc.getElementsByClass(className);
            return matches.text();
        }

        /**
         * Rewrites the production host to the staging host. Uses a literal
         * replacement: with a regex, the dots would match any character.
         */
        private static String toStagingHost(String url) {
            return url.replace(PRODUCTION_HOST, STAGING_HOST);
        }

        /**
         * Reads the path list, compares production against staging for each path
         * and appends one {@code Error:<TAB>path<TAB>field} line per differing
         * field to the report file. Blank lines in the path list are skipped.
         *
         * <p>Findings are written as each path is processed, so results gathered
         * before a failure are not lost, and both files are closed on every exit
         * path.
         *
         * @param args ignored
         * @throws IOException if a file cannot be read or written, or a page
         *         fetch fails in a way {@link #ICB(String)} does not handle
         */
        public static void main(String args[]) throws IOException {
            File f1 = new File("C:/ICBTest/CategoryUrl.txt");
            File f2 = new File("C:/ICBTest/CategoryError.txt");
            ICBCategoryTest tester = new ICBCategoryTest();

            BufferedReader br = new BufferedReader(new FileReader(f1));
            try {
                BufferedWriter bw = new BufferedWriter(new FileWriter(f2, true));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        if (line.trim().length() == 0) {
                            continue;
                        }
                        String[] production = tester.ICB(
                                "http://www.internetcorkboard.com"+line+"?source=miva");
                        String[] staging = tester.ICB(
                                "http://staging.internetcorkboard.com"+line+"?source=miva");
                        for (int j = 0; j < FIELD_NAMES.length; j++) {
                            if (!production[j].equals(staging[j])) {
                                bw.write("Error:"+"\t"+line+"\t"+FIELD_NAMES[j]);
                                bw.newLine();
                            }
                        }
                        // Persist this path's findings before the next fetch.
                        bw.flush();
                    }
                } finally {
                    bw.close();
                }
            } finally {
                br.close();
            }
        }
    }
  • 相关阅读:
    排序算法——选择排序
    poj1906
    poj1496
    poj1244
    poj1183
    poj1806
    !!!舒尔茨自律神经训练法
    Google Analytics的能与不能
    乔布斯最伟大的贡献是什么
    冥想呼吸
  • 原文地址:https://www.cnblogs.com/leonxiaosi/p/3024095.html
Copyright © 2011-2022 走看看