zoukankan      html  css  js  c++  java
  • HttpClient 4.x 执行网站登录并抓取网页的代码

    HttpClient 4.x 的 API 与 3.x 相比变化很大,这段代码演示如何用 4.x 执行网站登录,并在登录后用同一个客户端抓取网页。 
    HttpClient API 文档(4.0.x), HttpCore API 文档(4.1) 

    package spider;
     
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.io.IOUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.*;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.message.BasicNameValuePair;
     
    /**
     * Small crawler that uses the dict.cn site as its example: it submits the
     * login form and then fetches a page through the same shared client.
     *
     * <p>Every response obtained from the shared client must have its content
     * stream closed (or fully read) before the next request is issued, otherwise
     * the underlying connection stays allocated and the next request cannot use it.
     *
     * @author Winter Lau
     */
    public class DictSpider {

        /** Charset used to decode every response body that is printed. */
        private static final String PAGE_CHARSET = "GBK";

        /** Single client shared by the login request and the page requests. */
        private static final HttpClient client = new DefaultHttpClient();

        /**
         * Logs in with placeholder credentials, fetches one page and prints it.
         *
         * @param args unused
         * @throws IOException if either request fails
         */
        public static void main(String[] args) throws IOException {
            try {
                login("<用户名>","<密码>", false);
                get("http://www16.dict.cn/bdc/141");
            } finally {
                // Release the connections held by the shared client once the run is over.
                client.getConnectionManager().shutdown();
            }
        }

        /**
         * Fetches a page, printing its status line followed by its body.
         *
         * @param url address of the page to fetch
         * @throws IOException if the request fails or the body cannot be read
         */
        static void get(String url) throws IOException {
            HttpGet get = new HttpGet(url);
            HttpResponse response = client.execute(get);
            System.out.println(response.getStatusLine());
            dump(response.getEntity());
        }

        /**
         * Performs the login by posting the login form.
         *
         * <p>NOTE(review): the form, including the password, is posted to a plain
         * {@code http://} URL.
         *
         * @param user  value sent as the {@code username} form field
         * @param pwd   value sent as the {@code password} form field
         * @param debug when {@code true}, print the status line and body of the
         *              login response; when {@code false}, discard the body
         * @throws IOException if the request fails or the body cannot be read
         */
        static void login(String user, String pwd, boolean debug) throws IOException {
            HttpPost post = new HttpPost("http://dict.cn/login.php");
            post.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3");

            // Fields of the login form
            List<NameValuePair> qparams = new ArrayList<NameValuePair>();
            qparams.add(new BasicNameValuePair("username", user));
            qparams.add(new BasicNameValuePair("password", pwd));
            qparams.add(new BasicNameValuePair("url", "http://www16.dict.cn/bdc/141"));
            qparams.add(new BasicNameValuePair("loginforever", "1"));

            UrlEncodedFormEntity params = new UrlEncodedFormEntity(qparams, "UTF-8");
            post.setEntity(params);

            // Execute the request
            HttpResponse response = client.execute(post);
            HttpEntity entity = response.getEntity();

            if (debug) {
                // Examine the response status, then print (and thereby consume) the body
                System.out.println(response.getStatusLine());
                dump(entity);
            } else {
                // The body is not needed, but it must still be released: otherwise the
                // connection stays allocated and the next request on the shared client fails.
                release(entity);
            }
        }

        /**
         * Prints the body of a response and closes its content stream.
         *
         * @param entity response entity to print; {@code null} (a response without
         *               a body) prints nothing
         * @throws IOException if the body cannot be read
         */
        private static void dump(HttpEntity entity) throws IOException {
            if (entity == null) {
                return;
            }
            InputStream content = entity.getContent();
            try {
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(content, PAGE_CHARSET));
                System.out.println(IOUtils.toString(br));
            } finally {
                // Close even when decoding or printing fails so the connection is released.
                content.close();
            }
        }

        /**
         * Discards a response body by closing its content stream.
         *
         * @param entity response entity to discard; may be {@code null}
         * @throws IOException if closing the content stream fails
         */
        private static void release(HttpEntity entity) throws IOException {
            if (entity == null) {
                return;
            }
            InputStream content = entity.getContent();
            if (content != null) {
                content.close();
            }
        }

    }
    

      网址:http://www.oschina.net/code/snippet_12_2209

  • 相关阅读:
    【webpack4x】基本概念
    React平时的一些踩坑总结
    redux-saga学习进阶篇二
    redux-saga学习进阶篇一
    redux-saga学习基础篇一
    性能优化之节流(throttling)与防抖(debounce)
    git操作之发现push到远程仓库的代码有误
    git高级浅入之当我们需要去恢复到某个版本
    git高级浅入之当我们需要修改某次commit信息
    http验证CertificateValidation
  • 原文地址:https://www.cnblogs.com/lr393993507/p/4864761.html
Copyright © 2011-2022 走看看