zoukankan      html  css  js  c++  java
  • 使用Post方法模拟登陆爬取网页

    最近弄爬虫,遇到的一个问题就是如何使用post方法模拟登陆爬取网页。下面是极简版的代码:

    
    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    
    import java.net.HttpURLConnection;
    import java.net.URL;
    
    import java.util.HashMap;
    
    /**
     * Minimal example of fetching a page that sits behind a login by replaying a
     * captured POST request (URL, User-Agent, cookie and request body) with
     * {@link HttpURLConnection}.
     *
     * <p>Fill in the four placeholder constants below with values captured from a
     * logged-in browser session before running.
     */
    public class test {
    
    	// URL the POST request is sent to.
    	private static final String POST_URL = "";
    	
    	// User-Agent header value, used to make the request look like it comes from Chrome.
    	private static final String USER_AGENT = "";
    	
    	// Cookie header captured from a request to POST_URL after logging in to the site.
    	private static final String COOKIE = "";
    	
    	// Request body captured from a request to POST_URL after logging in to the site.
    	private static final String REQUEST_DATA =  "";
    	
    	// Charset used both to encode the request body and to decode the response.
    	private static final String CHARSET = "UTF-8";
    	
    	// Upper bound on how many times the request is sent before giving up.
    	private static final int MAX_ATTEMPTS = 5;
    	
    	// Base pause between attempts; multiplied by the attempt number (linear back-off).
    	private static final long RETRY_DELAY_MILLIS = 1000L;
    	
    	// Connect and read timeout so a stalled server cannot hang the program forever.
    	private static final int TIMEOUT_MILLIS = 15000;
    	
    	/**
    	 * Sends the captured POST request, retrying while the server answers with a
    	 * status other than 200, and prints the response body on success.
    	 *
    	 * <p>Retries are limited to {@link #MAX_ATTEMPTS} requests with a growing pause
    	 * between them, instead of looping forever and hammering the server.
    	 *
    	 * @param args unused
    	 * @throws IllegalStateException if no attempt produced a 200 response
    	 * @throws Exception if the URL is malformed or an I/O error occurs (not retried)
    	 */
    	public static void main(String[] args) throws Exception {
    		HashMap<String, String> map = postCapture(REQUEST_DATA);
    		String responseCode = map.get("responseCode");
    		
    		for (int attempt = 1; !"200".equals(responseCode); attempt++) {
    			if (attempt >= MAX_ATTEMPTS) {
    				throw new IllegalStateException("POST to " + POST_URL + " still returned HTTP "
    						+ responseCode + " after " + MAX_ATTEMPTS + " attempts");
    			}
    			Thread.sleep(RETRY_DELAY_MILLIS * attempt);
    			map = postCapture(REQUEST_DATA);
    			responseCode = map.get("responseCode");
    		}
    		
    		// Print the fetched page content.
    		System.out.println(map.get("value"));
    	}
    	
    	/**
    	 * Sends {@code requestData} as the body of a POST request to {@link #POST_URL}
    	 * with the configured User-Agent and Cookie headers.
    	 *
    	 * @param requestData request body, written as-is in UTF-8 (no line separator is appended)
    	 * @return a map with two entries: {@code "responseCode"} holding the HTTP status
    	 *         as a decimal string, and {@code "value"} holding the response body when
    	 *         the status is 200 (lines concatenated without line terminators) or an
    	 *         empty string otherwise
    	 * @throws Exception if the URL is malformed or the request fails with an I/O error
    	 */
    	private static HashMap<String, String> postCapture(String requestData) throws Exception{
    		HttpURLConnection httpConn = (HttpURLConnection) new URL(POST_URL).openConnection();
    		StringBuilder body = new StringBuilder();
    		int responseCode;
    		
    		try {
    			httpConn.setDoInput(true); // the response body will be read
    			httpConn.setDoOutput(true); // a request body will be written
    			httpConn.setUseCaches(false); // always go to the server, never a cached copy
    			httpConn.setConnectTimeout(TIMEOUT_MILLIS);
    			httpConn.setReadTimeout(TIMEOUT_MILLIS);
    			httpConn.setRequestMethod("POST");
    			httpConn.setRequestProperty("User-Agent", USER_AGENT);
    			httpConn.setRequestProperty("Cookie", COOKIE);
    			
    			// write() rather than PrintWriter.println(): println appends a platform line
    			// separator to the payload and PrintWriter hides write failures.
    			try (OutputStreamWriter out = new OutputStreamWriter(httpConn.getOutputStream(), CHARSET)) {
    				out.write(requestData);
    			}
    			
    			responseCode = httpConn.getResponseCode();
    			if (responseCode == HttpURLConnection.HTTP_OK) {
    				try (BufferedReader reader = new BufferedReader(
    						new InputStreamReader(httpConn.getInputStream(), CHARSET))) {
    					String line;
    					while ((line = reader.readLine()) != null) {
    						body.append(line);
    					}
    				}
    			}
    		} finally {
    			// Release the connection on every path, including non-200 responses and errors.
    			httpConn.disconnect();
    		}
    		
    		HashMap<String, String> map = new HashMap<>();
    		map.put("responseCode", String.valueOf(responseCode));
    		map.put("value", body.toString());
    		return map;
    	}
    
    }
    
    

    原文地址:
    http://wangxin123.com/2016/12/19/使用Post方法模拟登陆爬取网页/

  • 相关阅读:
    BZOJ1954 Pku3764 The xor-longest Path
    BZOJ3697 采药人的路径
    BZOJ1468 Tree
    BZOJ2326 [HNOI2011]数学作业
    BZOJ2809 [Apio2012]dispatching
    BZOJ1334 [Baltic2008]Elect
    BZOJ2882 工艺
    BZOJ3791 作业
    BZOJ1224 [HNOI2002]彩票
    [noip2013]花匠
  • 原文地址:https://www.cnblogs.com/wangxin37/p/6398743.html
Copyright © 2011-2022 走看看