zoukankan      html  css  js  c++  java
  • java抓取网页数据,登录之后抓取数据。

    最近做了一个从网络上抓取数据的小程序。主要是信贷方面的:把收集到的一些黑名单网站上的数据抓取到自己的系统中。

    也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

    首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu.com/s/1mgqOuHa

    1,获取网页内容(核心代码,技术有限没封装)。

    2,登录之后抓取网页数据(如何在请求中携带cookie)。

    3,获取网站的ajax请求方法(返回json)。

    以上这三点我就用一个类全部包含(比较糙望见谅,直接copy代码过去,应该就可以用)

    一,这个类包含上面1,2,3三种方法,直接运行main方法即可进行测试

    package com.minxinloan.black.web.utils;
    
    import java.io.BufferedReader;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStream;
    import java.io.PrintWriter;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLConnection;
    import java.net.URLEncoder;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.Map.Entry;
    import java.util.StringTokenizer;
    
    import net.sf.json.JSONArray;
    import net.sf.json.JSONObject;
    
    import org.jsoup.Connection;
    import org.jsoup.Connection.Method;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Demonstrates three ways of pulling data from a web site:
     * <ol>
     *   <li>fetching a page after logging in (the login cookies are replayed on the next request),</li>
     *   <li>fetching a public page,</li>
     *   <li>calling an endpoint with {@link HttpURLConnection} and parsing the response as JSON.</li>
     * </ol>
     * It also offers two helpers that turn JSON text into plain {@code Map}/{@code List} structures.
     */
    public class CookieUtil {

        public final static String CONTENT_TYPE = "Content-Type";

        /** User-Agent sent with the authenticated request; identifies the client as a desktop browser. */
        private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";

        /**
         * Runs the three demonstrations in order. An {@link IOException} from any jsoup request
         * aborts the remaining steps and is printed.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String listURL = "http://www.p2peye.com/blacklist.php?p=2";
            String logURL = "http://www.p2peye.com/member.php";

            try {
                fetchWithLogin(logURL, listURL);
                fetchWithoutLogin(listURL);
                fetchAjaxJson(listURL);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Logs in with a POST, copies every cookie of the login response onto a second request and
         * fetches the target page with it.
         *
         * @param loginUrl  URL the login form is posted to
         * @param targetUrl page that requires the login cookies
         * @return the parsed target page
         * @throws IOException if either request fails
         */
        private static Document fetchWithLogin(String loginUrl, String targetUrl) throws IOException {
            // NOTE(review): demo credentials are hard-coded; move them to configuration before real use.
            Connection.Response loginResponse = Jsoup.connect(loginUrl)
                    .data("mod", "logging",
                            "action", "login",
                            "loginsubmit", "yes",
                            "loginhash", "Lsc66",
                            "username", "puqiuxiaomao",
                            "password", "a1234567")
                    .method(Method.POST)
                    .execute();

            Connection con = Jsoup.connect(targetUrl);
            con.header("User-Agent", USER_AGENT);
            // The session cookie name depends on the target site, so replay every cookie we were given.
            for (Entry<String, String> cookie : loginResponse.cookies().entrySet()) {
                con.cookie(cookie.getKey(), cookie.getValue());
            }

            Document page = con.get();
            readElements(page);
            return page;
        }

        /**
         * Fetches a page that needs no login.
         *
         * @param url page to fetch
         * @return the parsed page
         * @throws IOException if the request fails
         */
        private static Document fetchWithoutLogin(String url) throws IOException {
            Document page = Jsoup.connect(url).get();
            readElements(page);
            return page;
        }

        /**
         * Shows how to read each element of a page: its {@code href} attribute and its text.
         * The values are read but not stored; replace the loop body with the real extraction.
         * {@code getAllElements()} already includes all descendants, so a single pass is enough.
         */
        private static void readElements(Document page) {
            Elements elements = page.getAllElements();
            for (Element element : elements) {
                element.attr("href"); // attribute value, e.g. the link target of an <a> tag
                element.text();       // text content of the tag
            }
        }

        /**
         * Reads the response of {@code url} as UTF-8 text and parses it as a JSON array.
         * This is a best-effort call: any failure is printed and an empty list is returned.
         *
         * @param url endpoint expected to return a JSON array of objects
         * @return the parsed rows, or an empty list when the request or the parsing fails
         */
        private static List<Map<String, Object>> fetchAjaxJson(String url) {
            HttpURLConnection connection = null;
            BufferedReader reader = null;
            try {
                StringBuilder sb = new StringBuilder();
                connection = (HttpURLConnection) new URL(url).openConnection();
                reader = new BufferedReader(new InputStreamReader(
                        connection.getInputStream(), "utf-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
                return parseJSON2List(sb.toString());
            } catch (Exception e) {
                e.printStackTrace();
                return new ArrayList<Map<String, Object>>();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException ignored) {
                        // nothing useful can be done when closing fails
                    }
                }
                // The connection is still null when opening it failed.
                if (connection != null) {
                    connection.disconnect();
                }
            }
        }

        /**
         * Parses a JSON object into a map. Array values become lists; every other value is stored
         * as returned by the JSON library.
         *
         * @param jsonStr text of a JSON object
         * @return a map with one entry per key of the object
         */
        public static Map<String, Object> parseJSON2Map(String jsonStr) {
            Map<String, Object> map = new HashMap<String, Object>();
            JSONObject json = JSONObject.fromObject(jsonStr);
            for (Object k : json.keySet()) {
                Object v = json.get(k);
                if (v instanceof JSONArray) {
                    map.put(k.toString(), convertArray((JSONArray) v));
                } else {
                    map.put(k.toString(), v);
                }
            }
            return map;
        }

        /**
         * Converts a JSON array into a list: objects become maps, nested arrays become lists and
         * any other element (string, number, boolean, ...) is kept unchanged.
         */
        private static List<Object> convertArray(JSONArray array) {
            List<Object> list = new ArrayList<Object>();
            Iterator<?> it = array.iterator();
            while (it.hasNext()) {
                Object item = it.next();
                if (item instanceof JSONObject) {
                    list.add(parseJSON2Map(item.toString()));
                } else if (item instanceof JSONArray) {
                    list.add(convertArray((JSONArray) item));
                } else {
                    list.add(item);
                }
            }
            return list;
        }

        /**
         * Parses a JSON array of objects into a list of maps.
         *
         * @param jsonStr text of a JSON array whose elements are JSON objects
         * @return one map per element, in array order
         * @throws ClassCastException if an element of the array is not a JSON object
         */
        public static List<Map<String, Object>> parseJSON2List(String jsonStr) {
            JSONArray jsonArr = JSONArray.fromObject(jsonStr);
            List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
            Iterator<?> it = jsonArr.iterator();
            while (it.hasNext()) {
                JSONObject row = (JSONObject) it.next();
                list.add(parseJSON2Map(row.toString()));
            }
            return list;
        }

    }

    二,这个是获取验证码的类,可以研究下。(但你需要先分析出网站验证码的请求地址)

    package com.minxinloan.black.web.utils;
    
    import java.io.BufferedReader;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStream;
    import java.io.PrintWriter;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLConnection;
    import java.net.URLEncoder;
    import java.nio.charset.Charset;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.StringTokenizer;
    
    public class Utils {//解析验证码的
        public static Content getRandom(String method, String sUrl,// 要解析的url
                Map<String, String> paramMap, // 存放用户名和密码的map
                Map<String, String> requestHeaderMap,// 存放COOKIE的map
                boolean isOnlyReturnHeader, String path) {
    
            Content content = null;
            HttpURLConnection httpUrlConnection = null;
            InputStream in = null;
            try {
                URL url = new URL(sUrl);
                boolean isPost = "POST".equals(method);
                if (method == null
                        || (!"GET".equalsIgnoreCase(method) && !"POST"
                                .equalsIgnoreCase(method))) {
                    method = "POST";
                }
                URL resolvedURL = url;
                URLConnection urlConnection = resolvedURL.openConnection();
                httpUrlConnection = (HttpURLConnection) urlConnection;
                httpUrlConnection.setRequestMethod(method);
                httpUrlConnection.setRequestProperty("Accept-Language",
                        "zh-cn,zh;q=0.5");
                // Do not follow redirects, We will handle redirects ourself
                httpUrlConnection.setInstanceFollowRedirects(false);
                httpUrlConnection.setDoOutput(true);
                httpUrlConnection.setDoInput(true);
                httpUrlConnection.setConnectTimeout(5000);
                httpUrlConnection.setReadTimeout(5000);
                httpUrlConnection.setUseCaches(false);
                httpUrlConnection.setDefaultUseCaches(false);
                httpUrlConnection.connect();
    
                int responseCode = httpUrlConnection.getResponseCode();
    
                if (responseCode == HttpURLConnection.HTTP_OK
                        || responseCode == HttpURLConnection.HTTP_CREATED) {
                    byte[] bytes = new byte[0];
                    if (!isOnlyReturnHeader) {
                        DataInputStream ins = new DataInputStream(
                                httpUrlConnection.getInputStream());
                        // 验证码的位置
                        DataOutputStream out = new DataOutputStream(
                                new FileOutputStream(path + "/code.bmp"));
                        byte[] buffer = new byte[4096];
                        int count = 0;
                        while ((count = ins.read(buffer)) > 0) {
                            out.write(buffer, 0, count);
                        }
                        out.close();
                        ins.close();
                    }
                    String encoding = null;
                    if (encoding == null) {
                        encoding = getEncodingFromContentType(httpUrlConnection
                                .getHeaderField(""));
                    }
                    content = new Content(sUrl, new String(bytes, encoding),
                            httpUrlConnection.getHeaderFields());
                }
            } catch (Exception e) {
                return null;
            } finally {
                if (httpUrlConnection != null) {
                    httpUrlConnection.disconnect();
                }
            }
            return content;
        }
    
        public static String getEncodingFromContentType(String contentType) {
            String encoding = null;
            if (contentType == null) {
                return null;
            }
            StringTokenizer tok = new StringTokenizer(contentType, ";");
            if (tok.hasMoreTokens()) {
                tok.nextToken();
                while (tok.hasMoreTokens()) {
                    String assignment = tok.nextToken().trim();
                    int eqIdx = assignment.indexOf('=');
                    if (eqIdx != -1) {
                        String varName = assignment.substring(0, eqIdx).trim();
                        if ("charset".equalsIgnoreCase(varName)) {
                            String varValue = assignment.substring(eqIdx + 1)
                                    .trim();
                            if (varValue.startsWith(""")
                                    && varValue.endsWith(""")) {
                                // substring works on indices
                                varValue = varValue.substring(1,
                                        varValue.length() - 1);
                            }
                            if (Charset.isSupported(varValue)) {
                                encoding = varValue;
                            }
                        }
                    }
                }
            }
            if (encoding == null) {
                return "UTF-8";
            }
            return encoding;
        }
    
        // 这个是输出
        public static boolean inFile(String content, String path) {
            PrintWriter out = null;
            File file = new File(path);
            try {
                if (!file.exists()) {
                    file.createNewFile();
                }
                out = new PrintWriter(new FileWriter(file));
    
                out.write(content);
                out.flush();
                return true;
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                out.close();
            }
            return false;
        }
    
        public static String getHtmlReadLine(String httpurl) {
            String CurrentLine = "";
            String TotalString = "";
            InputStream urlStream;
            String content = "";
    
            try {
                URL url = new URL(httpurl);
    
                HttpURLConnection connection = (HttpURLConnection) url
                        .openConnection();
    
                connection.connect();
                System.out.println(connection.getResponseCode());
                urlStream = connection.getInputStream();
    
                BufferedReader reader = new BufferedReader(
    
                new InputStreamReader(urlStream, "utf-8"));
    
                while ((CurrentLine = reader.readLine()) != null) {
                    TotalString += CurrentLine + "
    ";
                }
    
                content = TotalString;
    
            } catch (Exception e) {
            }
    
            return content;
        }
    }
    
    
    /**
     * Holder for the outcome of an HTTP request: the requested URL, the response body as text
     * and the response headers. The header map is stored as passed in, without copying.
     */
    class Content {
        private final String url;
        private final String body;
        private final Map<String, List<String>> headers;

        /**
         * @param url     URL that was requested
         * @param body    response body as text
         * @param headers response headers, keyed by header name
         */
        public Content(String url, String body, Map<String, List<String>> headers) {
            this.headers = headers;
            this.body = body;
            this.url = url;
        }

        /** @return the URL that was requested */
        public String getUrl() {
            return this.url;
        }

        /** @return the response body as text */
        public String getBody() {
            return this.body;
        }

        /** @return the very map of response headers given to the constructor */
        public Map<String, List<String>> getHeaders() {
            return this.headers;
        }

    }
  • 相关阅读:
    BP神经网络的数学原理及其算法实现
    机器学习必知的10大算法
    支持向量机通俗导论(理解SVM的三层境界)
    svm算法 最通俗易懂讲解
    read/write/fsync与fread/fwrite/fflush的关系和区别
    ubuntu16.04编译QT5.6所依赖的库
    Linux下的tar压缩解压缩命令详解
    Ubuntu16下编译linux内核,报"mkimage" command not found错的解决
    Git之(一)Git是什么[转]
    mac上完整卸载删除:android studio方案
  • 原文地址:https://www.cnblogs.com/puqiuxiaomao/p/4037918.html
Copyright © 2011-2022 走看看