1、贴出主要代码。这个不是python,python只涉及服务端对信息提取结果的接收。主体是java + android + js。由于淘宝各模块都使用二级子域名,而ajax不能跨域,无法只在一个页面内完成所有请求,因此需要依次加载不同的页面。以下是主要部分,js内容由服务端分发。
这样做的好处是:即使不使用微服务,单台机器也能满足1000个用户在同一分钟内提交账号密码请求登录,既简化了后台编写的复杂度,又减小了服务器压力。密码、验证码的校验也更及时。
2、不是爬自己的信息,是获取别人 任意账号 + 密码的淘宝个人信息,如果是为了拿到自己的信息,搞这么多七七八八的那是闲的蛋疼。具体账号 密码是哪来的,置顶第一篇有介绍。
package com.touna.crawlmodule; import android.graphics.Bitmap; import android.net.http.SslError; import android.support.v7.app.AppCompatActivity; import android.os.Bundle; import android.util.Log; import android.view.View; import android.webkit.CookieManager; import android.webkit.JavascriptInterface; import android.webkit.SslErrorHandler; import android.webkit.ValueCallback; import android.webkit.WebChromeClient; import android.webkit.WebSettings; import android.webkit.WebView; import android.webkit.WebViewClient; import org.json.JSONObject; import com.xx.httprequest.CrawlResultSender; import com.xx.view.LogUtil; import com.xx.view.ViewUtil; import com.xx.view.WebViewTimer; public class TaobaoActivity extends AppCompatActivity { private static final String TAG = "MainActivity"; private static final String LOGINPAGEURL = "https://login.m.taobao.com/login.htm";//移动端登陆页面 private static final String MOBILEINDEXPAGEURL = "http://h5.m.taobao.com/mlapp/mytaobao.html";//移动端淘宝个人用户首页 private static final String PCINDEXPAGEURL = "https://www.taobao.com/"; private static final String BINDPAGEURL = "http://member1.taobao.com/member/fresh/account_management.htm"; private static final String COLLECTIONURL = "https://shoucang.taobao.com/nodejs/item_collect_chunk.htm";//收藏页面url private static final String ADDRESSURL = "https://member1.taobao.com/member/fresh/deliver_address.htm";//收货地址url private static final String MYPATHURL = "https://lu.taobao.com/newMyPath.htm";//我的足迹url private static final String BOUGHTSHOPSURL = "https://favorite.taobao.com/list_bought_shops_n.htm";//已经购买的店铺 private static final String BOUGHTITEMSURL = "https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm";//已经购买的物品 private static final String SHOPCARTURL = "https://cart.taobao.com/cart.htm";//购物车URL private static final String SAFESETTINGURL = "http://member1.taobao.com/member/fresh/certify_info.htm";//安全信息设置 private static final String TRADEINFOURL = 
"http://member1.taobao.com/member/fresh/account_profile.htm";//交易信息url private static final String PERSONALINFOURL = "https://i.taobao.com/user/baseInfoSet.htm";//个人资料url private static final String POINTSURL = "https://pages.tmall.com/wow/jifen/act/point-details";//积分URL private static final String WEIBOURL = "http://member1.taobao.com/member/fresh/weibo_bind_management.htm";//绑定微博URL private static final String REFUSEURL = "https://refund2.tmall.com/dispute/buyerDisputeList.htm?type=1&disputeType=1";//退货管理URL private static final String HUABEIURL = "https://i.taobao.com/my_taobao.htm";//支付宝余额和花呗额度 private JSONObject dataJson=new JSONObject(); @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_taobo); startWebView(); } private void startWebView() { WebView webView = findViewById(R.id.taobaoView); final WebSettings settings = webView.getSettings(); settings.setUseWideViewPort(true); settings.setLayoutAlgorithm(WebSettings.LayoutAlgorithm.NARROW_COLUMNS); settings.setLoadWithOverviewMode(true); settings.setJavaScriptEnabled(true); webView.addJavascriptInterface(new JsInterface(), "JsInterface"); settings.setJavaScriptEnabled(true); settings.setLoadWithOverviewMode(true); settings.setSupportZoom(true); settings.setDomStorageEnabled(true); settings.setCacheMode(WebSettings.LOAD_NO_CACHE); settings.setAllowFileAccess(true); settings.setUseWideViewPort(true); settings.setSupportMultipleWindows(true); settings.setLoadsImagesAutomatically(true); //settings.setBlockNetworkImage(false); settings.setDefaultTextEncodingName("GBK"); webView.setVerticalScrollBarEnabled(true); webView.setHorizontalScrollBarEnabled(true); settings.setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); webView.setWebChromeClient(new WebChromeClient()); startWebViewClient(webView); webView.loadUrl(LOGINPAGEURL); } /** * @param view 
WebView对象 * 初始化webviewClient */ private void startWebViewClient(WebView view) { view.setWebViewClient(new WebViewClient() { @Override public void onReceivedSslError(WebView view, SslErrorHandler handler, SslError error) { handler.proceed(); } @Override public void onPageStarted(final WebView view, String url, Bitmap favicon) { Log.e(TAG, "onPageStarted: " + url); if (url.contains(LOGINPAGEURL)){ view.setVisibility(View.GONE); } } /** * @param view 浏览器对象 * @param url 浏览器地址 */ @Override public void onPageFinished(final WebView view, String url) { Log.e(TAG, "onPageFinished: " + url); if (url.contains(LOGINPAGEURL)) { ViewUtil.injectScriptFile(view, "loginPage/taobaoInit.js"); view.loadUrl("javascript:initLoginPage()"); new WebViewTimer(view, 300){ @Override public void operateView(){ view.setVisibility(View.VISIBLE); } }; } if (url.contains(MOBILEINDEXPAGEURL)) { //view.getSettings().setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); ViewUtil.setNoImage(view); //关闭图片 view.loadUrl(PCINDEXPAGEURL); } if (PCINDEXPAGEURL.equals(url)) { view.loadUrl(REFUSEURL); } if (url.contains(REFUSEURL)) { ViewUtil.injectScriptFile(view, "jquery.min.js"); //此处需要jquery! 
ViewUtil.injectScriptFromInternet(view, "taobao/refund.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractRefund());"); view.loadUrl(POINTSURL); } if (url.contains(POINTSURL)) { Log.e(TAG, "onPageFinished: inject"); ViewUtil.injectScriptFromInternet(view, "taobao/point.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractPoint());"); view.loadUrl(PERSONALINFOURL); } if (url.contains(PERSONALINFOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/personalInformation.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractPersonalInformation());"); view.loadUrl(COLLECTIONURL); } if (url.contains(COLLECTIONURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/collect.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractCollect());"); view.loadUrl(ADDRESSURL); } if (url.contains(ADDRESSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/delivery.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractDelivery());"); view.loadUrl(MYPATHURL); } if (url.contains(MYPATHURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/footprint.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractFootprint());"); view.loadUrl(BOUGHTSHOPSURL); } if (url.contains(BOUGHTSHOPSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/havaboughtStore.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBoughtStore());"); view.loadUrl(BOUGHTITEMSURL); } if (url.contains(BOUGHTITEMSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/havebought.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBought());"); view.loadUrl(SHOPCARTURL); } if (url.contains(SHOPCARTURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/shoppingCart.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractShoppingCart());"); view.loadUrl(SAFESETTINGURL); } if (url.contains(SAFESETTINGURL)) { 
ViewUtil.injectScriptFromInternet(view, "taobao/safeSettings.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractSafeSettings());"); view.loadUrl(TRADEINFOURL); } if (url.contains(TRADEINFOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/tradeInfo.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractTradeInfo());"); view.loadUrl(WEIBOURL); } if (url.contains(WEIBOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/weibo.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractWeibo());"); view.loadUrl(BINDPAGEURL); } if (url.contains(BINDPAGEURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/alipayBinding.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractAlipay());"); view.loadUrl(HUABEIURL); } if (url.contains(HUABEIURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/huabei.js"); view.loadUrl("javascript:clickHuabei1()"); new WebViewTimer(view, 2000){ @Override public void operateView(){ view.loadUrl("javascript:clickHuabei2()"); } }; new WebViewTimer(view, 4000){ @Override public void operateView(){ view.evaluateJavascript("extractHuabei()", new ValueCallback<String>() { @Override public void onReceiveValue(String s) { Log.e(TAG, "onReceiveValue: "+s ); String jsonStr = ViewUtil.getStrLikeJson(s); ViewUtil.reconsituteJSon(jsonStr, dataJson); ViewUtil.showLargeLog(dataJson.toString()); CrawlResultSender.sendToweb("taobao", dataJson.toString()); } }); } }; } } }); } class JsInterface { private static final String TAG = "JSInterface"; @JavascriptInterface public void getReturnString(String returnValue) throws Exception{ Log.e(TAG,"当前项返回值是: " + returnValue); ViewUtil.reconsituteJSon(returnValue,dataJson); } } }
下面贴出其中一个js示例:提取用户收藏的物品。这里没有直接翻页,而是使用ajax请求以提升效率,且ajax必须使用同步方式。由于此接口返回的是页面(HTML)而不是json,因此可以用css选择器提取。
/**
 * Created by ㄟ(▔=▔)ㄏ on 2018/1/5.
 */
/*
 * https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=0
 * Extracts the items the account has added to its favourites.
 */

/**
 * Minimal XMLHttpRequest wrapper.
 *
 * @param {Object}   opt
 * @param {string}   [opt.type='POST']  HTTP method; only POST and GET are sent.
 * @param {string}   [opt.url='']       Target URL (may already contain a query string).
 * @param {boolean}  [opt.async=false]  Asynchronous flag; defaults to a synchronous request.
 * @param {Object}   [opt.data=null]    Key/value pairs, URL-encoded into the body (POST)
 *                                      or the query string (GET).
 * @param {Function} [opt.success]      Called with (responseText, xhr) when the request
 *                                      completes with a 2xx status.
 * @returns {XMLHttpRequest} the request object; for synchronous calls it is already complete.
 */
function myajax(opt) {
    opt = opt || {};
    // Apply the default BEFORE upper-casing; the original threw when type was omitted.
    opt.type = (opt.type || 'POST').toUpperCase();
    opt.url = opt.url || '';
    opt.async = opt.async || false;
    opt.data = opt.data || null;
    opt.success = opt.success || function () {};

    // typeof guard: a bare `XMLHttpRequest` reference throws where it is not defined.
    var xmlHttp = (typeof XMLHttpRequest !== 'undefined')
        ? new XMLHttpRequest()
        : new ActiveXObject('Microsoft.XMLHTTP');

    var params = [];
    for (var key in opt.data) {
        if (Object.prototype.hasOwnProperty.call(opt.data, key)) {
            // Encode both sides so '&', '=', spaces and non-ASCII text survive transport.
            params.push(encodeURIComponent(key) + '=' + encodeURIComponent(opt.data[key]));
        }
    }
    var postData = params.join('&');

    function notifySuccess() {
        if (xmlHttp.readyState === 4 && xmlHttp.status >= 200 && xmlHttp.status < 300) {
            opt.success(xmlHttp.responseText, xmlHttp);
        }
    }

    if (opt.type === 'POST') {
        xmlHttp.open(opt.type, opt.url, opt.async);
        xmlHttp.onreadystatechange = notifySuccess;
        xmlHttp.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded;charset=utf-8');
        xmlHttp.send(postData);
    } else if (opt.type === 'GET') {
        var target = opt.url;
        if (postData) {
            // Join with '&' when the URL already has a query string, and add nothing at all
            // when there are no parameters (the original always appended a stray '?').
            target += (target.indexOf('?') === -1 ? '?' : '&') + postData;
        }
        xmlHttp.open(opt.type, target, opt.async);
        xmlHttp.onreadystatechange = notifySuccess;
        xmlHttp.send(null);
    }
    return xmlHttp;
}


/**
 * Fetches the favourites list page by page with synchronous AJAX and extracts name, URL and
 * price of every item from the RETURNED HTML (the endpoint answers with markup, not JSON).
 *
 * Called by the Android host as
 * `javascript:window.JsInterface.getReturnString(extractCollect());`.
 *
 * @returns {string} JSON text of the form {"collectInfo":[{collectName, collectUrl, collectPrice}, ...]}
 */
function extractCollect() {
    var PAGE_SIZE = 30;        // items per page
    var LAST_PAGE_INDEX = 3;   // pages 0..3 are requested, same range as the original recursion
    var CHUNK_URL = 'https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=';

    var collectList = [];

    for (var page = 0; page <= LAST_PAGE_INDEX; page++) {
        console.debug("当前是第 " + page + "页");
        var url = CHUNK_URL + (page * PAGE_SIZE);

        var htmlStr;
        try {
            htmlStr = myajax({type: 'GET', url: url, async: false}).responseText || '';
        } catch (e) {
            // A failed synchronous request throws; keep what was collected so far.
            console.error(e);
            break;
        }

        // No list item in the response means there are no more pages.
        if (htmlStr.indexOf('J_FavListItem') === -1) {
            break;
        }

        // Parse the fetched markup. The original queried the live `document`, which re-read
        // the items already on screen once per page instead of reading the fetched page.
        var container = document.createElement('div');
        container.innerHTML = htmlStr;

        var items = container.querySelectorAll('li.J_FavListItem');
        for (var i = 0; i < items.length; i++) {
            var link = items[i].querySelector('a.img-item-title-link');
            var priceElement = items[i].querySelector('.g_price strong');
            var collectObj = {
                'collectName': link ? link.title : '',
                'collectUrl': link ? link.href : '',
                // A missing price element means the item is no longer available.
                'collectPrice': priceElement ? priceElement.innerText : "宝贝已失效"
            };
            console.info(collectObj);
            collectList.push(collectObj);
        }
        console.info(url);
    }

    return JSON.stringify({'collectInfo': collectList});
}
这就是唯一登录淘宝获取信息的方法,不管是什么语言java py,不管是用httpclient urlconnection还是urllib requests 想达到 本篇的目的,可能性为0。不服不信的可以用httpclient urllib试试,光是一个接口登录淘宝,网上就在悬赏5万人民币了,就不说提取信息了,单是把这个接口登录淘宝解决,相当于几个月的工资了。