zoukankan      html  css  js  c++  java
  • Java模拟登陆新浪微博抓取数据【转载】

      1 package com.shiyimm.crawler.weibo;
      2 
      3 import java.io.FileNotFoundException;
      4 import java.io.FileReader;
      5 import java.io.IOException;
      6 import java.io.UnsupportedEncodingException;
      7 import java.net.URLDecoder;
      8 import java.net.URLEncoder;
      9 import java.util.ArrayList;
     10 import java.util.Date;
     11 import java.util.List;
     12 import java.util.regex.Matcher;
     13 import java.util.regex.Pattern;
     14 
     15 import javax.script.Invocable;
     16 import javax.script.ScriptEngine;
     17 import javax.script.ScriptEngineManager;
     18 import javax.script.ScriptException;
     19 
     20 import net.sf.json.JSONObject;
     21 
     22 import org.apache.commons.codec.binary.Base64;
     23 import org.apache.http.NameValuePair;
     24 import org.apache.http.client.ClientProtocolException;
     25 import org.apache.http.client.HttpClient;
     26 import org.apache.http.impl.client.DefaultHttpClient;
     27 import org.apache.http.message.BasicNameValuePair;
     28 
     29 import com.shiyimm.crawler.util.MyUrlUtil;
     30 import com.shiyimm.crawler.util.UrlUtil;
     31 
     32 public class SinaWeibo {
     33     private HttpClient client;
     34     private String username;    //登录帐号(明文)
     35     private String password;    //登录密码(明文)
     36     private String su;            //登录帐号(Base64加密)
     37     private String sp;            //登录密码(各种参数RSA加密后的密文)
     38     private long servertime;    //初始登录时,服务器返回的时间戳,用以密码加密以及登录用
     39     private String nonce;        //初始登录时,服务器返回的一串字符,用以密码加密以及登录用
     40     private String rsakv;        //初始登录时,服务器返回的一串字符,用以密码加密以及登录用
     41     private String pubkey;        //初始登录时,服务器返回的RSA公钥
     42     
     43     private String errInfo;        //登录失败时的错误信息
     44     private String location;    //登录成功后的跳转连接
     45     
     46     private String url;
     47     
     48     public SinaWeibo(String username,String password){
     49         client = new DefaultHttpClient();
     50         this.username = username;
     51         this.password = password;
     52     }
     53     
     54     
     55     /**
     56      * 初始登录信息<br>
     57      * 返回false说明初始失败
     58      * @return
     59      */
     60     public boolean preLogin(){
     61         boolean flag = false;
     62         try {
     63             su = new String(Base64.encodeBase64(URLEncoder.encode(username, "UTF-8").getBytes()));
     64             String url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&rsakt=mod&checkpin=1&" +
     65                     "client=ssologin.js(v1.4.5)&_="+getTimestamp();
     66             url += "&su="+su;
     67             String content;
     68             content = HttpTools.getRequest(client, url);
     69             //System.out.println(content);
     70             System.out.println("content------------"+content);
     71             JSONObject json = JSONObject.fromObject(content);
     72             System.out.println(json);
     73             servertime = json.getLong("servertime");
     74             nonce = json.getString("nonce");
     75             rsakv = json.getString("rsakv");
     76             pubkey = json.getString("pubkey");
     77             flag = encodePwd();
     78         } catch (UnsupportedEncodingException e) {
     79             // TODO Auto-generated catch block
     80             //e.printStackTrace();
     81         } catch (ClientProtocolException e) {
     82             // TODO Auto-generated catch block
     83             //e.printStackTrace();
     84         } catch (IOException e) {
     85             // TODO Auto-generated catch block
     86             //e.printStackTrace();
     87         }
     88         return flag;
     89     }
     90     
     91     /**
     92      * 登录
     93      * @return true:登录成功
     94      */
     95     public boolean login(){
     96         if(preLogin()){
     97             String url = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";
     98             List<NameValuePair> parms = new ArrayList<NameValuePair>();
     99             parms.add(new BasicNameValuePair("entry", "weibo"));
    100             parms.add(new BasicNameValuePair("geteway", "1"));
    101             parms.add(new BasicNameValuePair("from", ""));
    102             parms.add(new BasicNameValuePair("savestate", "7"));
    103             parms.add(new BasicNameValuePair("useticket", "1"));
    104             parms.add(new BasicNameValuePair("pagerefer", "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%2F"));
    105             parms.add(new BasicNameValuePair("vsnf", "1"));
    106             parms.add(new BasicNameValuePair("su", su));
    107             parms.add(new BasicNameValuePair("service", "miniblog"));
    108             parms.add(new BasicNameValuePair("servertime", servertime+""));
    109             parms.add(new BasicNameValuePair("nonce", nonce));
    110             parms.add(new BasicNameValuePair("pwencode", "rsa2"));
    111             parms.add(new BasicNameValuePair("rsakv", rsakv));
    112             parms.add(new BasicNameValuePair("sp", sp));
    113             parms.add(new BasicNameValuePair("encoding", "UTF-8"));
    114             parms.add(new BasicNameValuePair("prelt", "182"));
    115             parms.add(new BasicNameValuePair("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
    116             parms.add(new BasicNameValuePair("returntype", "META"));
    117             try {
    118                 String content = HttpTools.postRequest(client, url, parms);
    119                 System.out.println("content----------"+content);
    120                 String regex = "location\.replace\("(.+?)"\);";
    121                 Pattern p = Pattern.compile(regex);
    122                 Matcher m = p.matcher(content);
    123                 if(m.find()){
    124                     location = m.group(1);
    125                     if(location.contains("reason=")){
    126                         errInfo = location.substring(location.indexOf("reason=")+7);
    127                         errInfo = URLDecoder.decode(errInfo, "GBK");
    128                     }else{
    129                         String result = HttpTools.getRequest(client, location);
    130                         System.out.println("result--------------"+result);
    131                         return true;
    132                     }
    133                 }
    134             } catch (ClientProtocolException e) {
    135                 // TODO Auto-generated catch block
    136                 e.printStackTrace();
    137             } catch (IOException e) {
    138                 // TODO Auto-generated catch block
    139                 e.printStackTrace();
    140             }
    141 //            url = "http://www.weibo.com/hm";
    142 //            System.out.println(MyUrlUtil.getResource(url));
    143         }
    144         return false;
    145     }
    146     
    147     /**
    148      * 密码进行RSA加密&lt;br&gt;
    149      * 返回false说明加密失败
    150      * @return
    151      */
    152     private boolean encodePwd(){
    153         ScriptEngineManager sem = new ScriptEngineManager();
    154         ScriptEngine se = sem.getEngineByName("javascript");
    155         try {
    156             FileReader fr = new FileReader("E:\encoder.js");
    157             se.eval(fr);
    158             Invocable invocableEngine = (Invocable) se;
    159             String callbackvalue = (String) invocableEngine.invokeFunction("encodePwd",pubkey,servertime,nonce,password);
    160             sp = callbackvalue;
    161             return true;
    162         } catch (FileNotFoundException e) {
    163             // TODO Auto-generated catch block
    164             System.out.println("加密脚本encoder.sj未找到");
    165         } catch (ScriptException e) {
    166             // TODO Auto-generated catch block
    167             //e.printStackTrace();
    168         } catch (NoSuchMethodException e) {
    169             // TODO Auto-generated catch block
    170             //e.printStackTrace();
    171         }
    172         errInfo = "密码加密失败!";
    173         return false;
    174     }
    175     
    176     public String getErrInfo() {
    177         return errInfo;
    178     }
    179     
    180     /**
    181      * 获取时间戳
    182      * @return
    183      */
    184     private long getTimestamp(){
    185         Date now = new Date();
    186         return now.getTime();
    187     }
    188     
    189     public static void main(String[] args) throws ClientProtocolException, IOException {
    190         SinaWeibo weibo = new SinaWeibo("账号", "密码");
    191         if(weibo.login()){
    192             System.out.println("登陆成功!");
    193             String url = "http://www.weibo.com/hm";
    194 //            String source = MyUrlUtil.getResource(url);
    195 //            System.out.println(source);
    196         }else{
    197             System.out.println("登录失败!");
    198         }
    199     }
    200 }
    201 package com.shiyimm.crawler.weibo;
    202 
    203 import java.io.IOException;
    204 import java.util.List;
    205 
    206 import org.apache.http.HttpEntity;
    207 import org.apache.http.HttpResponse;
    208 import org.apache.http.NameValuePair;
    209 import org.apache.http.client.ClientProtocolException;
    210 import org.apache.http.client.HttpClient;
    211 import org.apache.http.client.entity.UrlEncodedFormEntity;
    212 import org.apache.http.client.methods.HttpGet;
    213 import org.apache.http.client.methods.HttpPost;
    214 import org.apache.http.util.EntityUtils;
    215 
    216 public class HttpTools {
    217     /**
    218      * 正常GET方式HTTP请求
    219      * @param client
    220      * @param url
    221      * @return
    222      * @throws ClientProtocolException
    223      * @throws IOException
    224      */
    225     public static String getRequest(HttpClient client,String url) throws ClientProtocolException, IOException{
    226         HttpGet get = new HttpGet(url);
    227         get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
    228         //get.addHeader(&quot;Referer&quot;, &quot;http://2013.weibo.com/&quot;);
    229         HttpResponse response = client.execute(get);
    230         HttpEntity entity = response.getEntity();
    231         String content = EntityUtils.toString(entity,"GBK");
    232         //System.out.println(content);
    233         /*EntityUtils.consume(entity);*/
    234         return content;
    235     }
    236     
    237     /**
    238      * 正常POST方式HTTP请求
    239      * @param client
    240      * @param url
    241      * @param parms
    242      * @return
    243      * @throws ClientProtocolException
    244      * @throws IOException
    245      */
    246     public static String postRequest(HttpClient client,String url,List<NameValuePair> parms) throws ClientProtocolException, IOException{
    247         HttpPost post = new HttpPost(url);
    248         post.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
    249         post.addHeader("Content-Type", "application/x-www-form-urlencoded");
    250         //post.addHeader(&quot;Referer&quot;, &quot;http://2013.weibo.com/&quot;);
    251         UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(parms, "UTF-8");
    252         post.setEntity(postEntity);
    253         HttpResponse response = client.execute(post);
    254         HttpEntity entity = response.getEntity();
    255         String content = EntityUtils.toString(entity,"GBK");
    256         /*EntityUtils.consume(entity);*/
    257         return content;
    258     }
    259 }
  • 相关阅读:
    各种排序算法java实现,好文,做个备份
    一个SQL语句
    StrutsLayout tag library 1.1发布
    “单击将本站加入收藏夹”的代码
    IE无法上网连接的解决办法
    字符问题!
    Time Tracker Starter Kit 简介
    谈恋爱是百年好合的事
    一个sql语句
    ASP.NET 中的自定义脚本回调
  • 原文地址:https://www.cnblogs.com/dekevin/p/3581386.html
Copyright © 2011-2022 走看看