zoukankan      html  css  js  c++  java
  • 爬虫:爬取海词的翻译内容

    在爬取海词的时候遇到了一个问题,在异步加载的时候,需要一个t值,但是这个t值是js加载出来的,直接拼接的,我们无法从网页中得到;

     当在搜索框输入单词的时候:你在干嘛

    搜索

    替换下图中的page的值就能达到翻页的目的:

    那么当前的目的就是要能够找到这段js代码,同时获取对应输入的t的值,来重新组合url

    真正的url只需要如下内容:

    我提前把关键字和t都处理了,写成了字典的形式,

    key:你在干嘛  ff[key]:WuqarCRs

    {"你好":"WuqarCRs"}  #提前处理成了这种字典格式,方便提取
    url = "http://fuzz.dict.cn/dict/api.php?&action=fuzz&from=jsonp&q=" + key + "&t="+ ff[key]+"&page="

    那么关键部分来了,我是如何获取T的呢。

    大概思路:在本地搭建一个服务器,把每个词依次发送给它,由它运行这段js代码,再把每个词对应的返回结果保存起来。

    处理过程的代码:

    第一步:先找到那段js代码,里面是如何把输入的文字转换成8位字符串的算法

    第二步:先安装node.js,然后把这段js代码提取出来,转换成可以在node.js中运行的代码;如果不转换,直接在浏览器里访问是无法触发这段js执行的。

    下面是node.js的代码,先执行node.js代码

    var http = require('http');
    var querystring = require('querystring');
    var util = require('util');
    
    http.createServer(function(req, res){
        var post = '';     
        var mm = ''
        
        
    /**
     * Computes the 8-character `t` token for a search keyword.
     *
     * The keyword is UTF-8 encoded, the salt "PaSs" (plus
     * `global.dict_pagetoken`, when set) is appended, the result is run through
     * an MD5-style block transform, and 12 hex digits of the resulting state
     * are re-encoded with a custom 64-symbol alphabet.
     *
     * @param {string} J - The search keyword.
     * @returns {string} The 8-character token.
     */
    function dictCrypto(J) {
      // Adds g and f with bits 30 and 31 handled separately from the low
      // 30 bits, so the carry is worked out explicitly with bitwise operators.
      function r(g, f) {
        var e, d, a, b, c;
        a = g & 2147483648;
        b = f & 2147483648;
        e = g & 1073741824;
        d = f & 1073741824;
        c = (g & 1073741823) + (f & 1073741823);
        if (e & d) {
          return c ^ 2147483648 ^ a ^ b
        }
        return e | d ? c & 1073741824 ? c ^ 3221225472 ^ a ^ b : c ^ 1073741824 ^ a ^ b : c ^ a ^ b
      }

      // Round-1 step: mixes g with (f & e | ~f & d), the message word a and
      // the constant c, rotates left by b bits, then adds f.
      function I(g, f, e, d, a, b, c) {
        g = r(g, r(r(f & e | ~f & d, a), c));
        return r(g << b | g >>> 32 - b, f)
      }

      // Round-2 step: same shape as I, using (f & d | e & ~d).
      function s(g, f, e, d, a, b, c) {
        g = r(g, r(r(f & d | e & ~d, a), c));
        return r(g << b | g >>> 32 - b, f)
      }

      // Round-3 step: same shape as I, using (f ^ e ^ d).
      function w(g, f, e, d, a, b, c) {
        g = r(g, r(r(f ^ e ^ d, a), c));
        return r(g << b | g >>> 32 - b, f)
      }

      // Round-4 step: same shape as I, using (e ^ (f | ~d)).
      function v(g, f, e, d, a, b, c) {
        g = r(g, r(r(e ^ (f | ~d), a), c));
        return r(g << b | g >>> 32 - b, f)
      }

      // Encodes c as base-64 digits (least significant digit first) in the
      // alphabet "+", "-", "0"-"9", "B"-"[", "a"-"z", and returns the last two
      // characters of "++" followed by those digits.
      function K(c) {
        for (var b = "++"; c > 0;) {
          var a = c % 64;
          b += a == 0 ? "+" : a == 1 ? "-" : a > 1 && a < 12 ? String.fromCharCode(a + 46) : a > 11 && a < 38 ? String.fromCharCode(a + 54) : String.fromCharCode(a + 59);
          c = (c - a) / 64
        }
        return b.substr(b.length - 2, 2)
      }

      // Formats a 32-bit word as 8 hex digits, lowest byte first.
      function H(d) {
        var c = "",
          b = "",
          a;
        for (a = 0; a <= 3; a++) {
          b = d >>> a * 8 & 255;
          b = "0" + b.toString(16);
          c += b.substr(b.length - 2, 2)
        }
        return c
      }
      var x = [],G, L, q, p, F, E, D, C;
      // Step 1: turn the keyword into a byte string and append the salt.
      J = function(d) {
        // NOTE(review): this regex was garbled by line breaks in the published
        // listing; restored as the CRLF -> LF normalisation used by the common
        // JavaScript MD5 UTF-8 helper. Confirm against the site's script.
        d = d.replace(/\r\n/g, "\n");

        // Encodes each UTF-16 code unit as 1, 2 or 3 bytes. Surrogate pairs
        // are not combined; this is kept as-is so tokens stay unchanged.
        for (var c = "",b = 0; b < d.length; b++) {
          var a = d.charCodeAt(b);
          if (a < 128) {
            c += String.fromCharCode(a)
          } else {
            if (a > 127 && a < 2048) {
              c += String.fromCharCode(a >> 6 | 192)
            } else {
              c += String.fromCharCode(a >> 12 | 224);
              c += String.fromCharCode(a >> 6 & 63 | 128)
            }
            c += String.fromCharCode(a & 63 | 128)
          }
        }
        // Salt: the four characters "PaSs".
        c += String.fromCharCode(80, 97, 83, 115);
        if (global.dict_pagetoken) {
          c += global.dict_pagetoken
        }
        return c
      }(J);
      // Step 2: pack the bytes into little-endian 32-bit words, append the
      // 0x80 marker byte and store the bit length in the last two words.
      x = function(g) {
        var f, e = g.length;
        f = e + 8;
        for (var d = ((f - f % 64) / 64 + 1) * 16, a = Array(d - 1), b = 0, c = 0; c < e;) {
          f = (c - c % 4) / 4;
          b = c % 4 * 8;
          a[f] |= g.charCodeAt(c) << b;
          c++
        }
        a[(c - c % 4) / 4] |= 128 << c % 4 * 8;
        a[d - 2] = e << 3;
        a[d - 1] = e >>> 29;
        return a
      }(J);
      // Step 3: run the four rounds over every 16-word block.
      F = 1732584193;
      E = 4023233417;
      D = 2562383102;
      C = 271733878;
      for (J = 0; J < x.length; J += 16) {
        G = F;
        L = E;
        q = D;
        p = C;
        F = I(F, E, D, C, x[J + 0], 7, 3614090360);
        C = I(C, F, E, D, x[J + 1], 12, 3905402710);
        D = I(D, C, F, E, x[J + 2], 17, 606105819);
        E = I(E, D, C, F, x[J + 3], 22, 3250441966);
        F = I(F, E, D, C, x[J + 4], 7, 4118548399);
        C = I(C, F, E, D, x[J + 5], 12, 1200080426);
        D = I(D, C, F, E, x[J + 6], 17, 2821735955);
        E = I(E, D, C, F, x[J + 7], 22, 4249261313);
        F = I(F, E, D, C, x[J + 8], 7, 1770035416);
        C = I(C, F, E, D, x[J + 9], 12, 2336552879);
        D = I(D, C, F, E, x[J + 10], 17, 4294925233);
        E = I(E, D, C, F, x[J + 11], 22, 2304563134);
        F = I(F, E, D, C, x[J + 12], 7, 1804603682);
        C = I(C, F, E, D, x[J + 13], 12, 4254626195);
        D = I(D, C, F, E, x[J + 14], 17, 2792965006);
        E = I(E, D, C, F, x[J + 15], 22, 1236535329);
        F = s(F, E, D, C, x[J + 1], 5, 4129170786);
        C = s(C, F, E, D, x[J + 6], 9, 3225465664);
        D = s(D, C, F, E, x[J + 11], 14, 643717713);
        E = s(E, D, C, F, x[J + 0], 20, 3921069994);
        F = s(F, E, D, C, x[J + 5], 5, 3593408605);
        C = s(C, F, E, D, x[J + 10], 9, 38016083);
        D = s(D, C, F, E, x[J + 15], 14, 3634488961);
        E = s(E, D, C, F, x[J + 4], 20, 3889429448);
        F = s(F, E, D, C, x[J + 9], 5, 568446438);
        C = s(C, F, E, D, x[J + 14], 9, 3275163606);
        D = s(D, C, F, E, x[J + 3], 14, 4107603335);
        E = s(E, D, C, F, x[J + 8], 20, 1163531501);
        F = s(F, E, D, C, x[J + 13], 5, 2850285829);
        C = s(C, F, E, D, x[J + 2], 9, 4243563512);
        D = s(D, C, F, E, x[J + 7], 14, 1735328473);
        E = s(E, D, C, F, x[J + 12], 20, 2368359562);
        F = w(F, E, D, C, x[J + 5], 4, 4294588738);
        C = w(C, F, E, D, x[J + 8], 11, 2272392833);
        D = w(D, C, F, E, x[J + 11], 16, 1839030562);
        E = w(E, D, C, F, x[J + 14], 23, 4259657740);
        F = w(F, E, D, C, x[J + 1], 4, 2763975236);
        C = w(C, F, E, D, x[J + 4], 11, 1272893353);
        D = w(D, C, F, E, x[J + 7], 16, 4139469664);
        E = w(E, D, C, F, x[J + 10], 23, 3200236656);
        F = w(F, E, D, C, x[J + 13], 4, 681279174);
        C = w(C, F, E, D, x[J + 0], 11, 3936430074);
        D = w(D, C, F, E, x[J + 3], 16, 3572445317);
        E = w(E, D, C, F, x[J + 6], 23, 76029189);
        F = w(F, E, D, C, x[J + 9], 4, 3654602809);
        C = w(C, F, E, D, x[J + 12], 11, 3873151461);
        D = w(D, C, F, E, x[J + 15], 16, 530742520);
        E = w(E, D, C, F, x[J + 2], 23, 3299628645);
        F = v(F, E, D, C, x[J + 0], 6, 4096336452);
        C = v(C, F, E, D, x[J + 7], 10, 1126891415);
        D = v(D, C, F, E, x[J + 14], 15, 2878612391);
        E = v(E, D, C, F, x[J + 5], 21, 4237533241);
        F = v(F, E, D, C, x[J + 12], 6, 1700485571);
        C = v(C, F, E, D, x[J + 3], 10, 2399980690);
        D = v(D, C, F, E, x[J + 10], 15, 4293915773);
        E = v(E, D, C, F, x[J + 1], 21, 2240044497);
        F = v(F, E, D, C, x[J + 8], 6, 1873313359);
        C = v(C, F, E, D, x[J + 15], 10, 4264355552);
        D = v(D, C, F, E, x[J + 6], 15, 2734768916);
        E = v(E, D, C, F, x[J + 13], 21, 1309151649);
        F = v(F, E, D, C, x[J + 4], 6, 4149444226);
        C = v(C, F, E, D, x[J + 11], 10, 3174756917);
        D = v(D, C, F, E, x[J + 2], 15, 718787259);
        E = v(E, D, C, F, x[J + 9], 21, 3951481745);
        F = r(F, G);
        E = r(E, L);
        D = r(D, q);
        C = r(C, p)
      }
      // Step 4: take the first 4 hex digits of F, E and D (12 digits in all),
      // split them into four 3-digit groups and encode each group as two
      // characters.
      return function(d) {
          var c = parseInt("0x" + d.substr(0, 3), 16),
            b = parseInt("0x" + d.substr(3, 3), 16),
            a = parseInt("0x" + d.substr(6, 3), 16);
          d = parseInt("0x" + d.substr(9, 3), 16);
          return K(c) + K(b) + K(a) + K(d);
        }
        (H(F).substr(0, 4) + H(E).substr(0, 4) + H(D).substr(0, 4))
    }
      // Example: when the client posts the word "你好", the request body
      // carries that word.
        var chunks = [];

        // Handle the POST body. Chunks are only collected here: a body may
        // arrive in several pieces and a multi-byte UTF-8 character can be
        // split across two of them, so decoding and hashing wait for 'end'.
        req.on('data', function(chunk){
            chunks.push(chunk);
        });

        req.on('end', function(){
            var word = Buffer.concat(chunks).toString();
            process.stdout.write(word + '\n');

            // Build the `{ token: '' }` object directly rather than through
            // querystring.parse, which would decode any "+" in the token as a
            // space. An empty body still yields an empty object.
            post = {};
            if (word.length > 0) {
                mm = dictCrypto(word);
                post[mm] = '';
            }
            res.end(util.inspect(post));
        });
    }).listen(8888);
    
    console.log('Server running at http://127.0.0.1:8888/');
    View Code

    第三步:正常的python代码,去访问本地的服务器,直接把转换完的数据存储到本地

    #! /usr/bin/env python
    #coding: utf-8
    import re
    import os
    import requests
    import sys
    import json
    reload(sys)
    sys.setdefaultencoding('utf-8')
    path = "D:\106_data\juhai_data\"
    ff = open(path + "answer_1.txt",'a')
    f = open("data_1.dict")   #这个是你的词典,按照行来访问词典
    tt = {}
    i = 1
    j = 1
    
    s = requests.session()
    s.keep_alive = False
    
    while 1:
        word = f.readline()
        if not word:
            ans = json.dumps(tt)
            ff.write(ans)
            break
        print word,
        if (i%100000 == 0):#一万个词存储一次,存的格式为字典
            j = j + 1
            ans = json.dumps(tt)
            ff.write(ans)
            ff.close()
            ff = open(path + "answer_" +str(j) + ".txt",'a')
            tt = {}
        word = word.strip('
    ')
        html = requests.post("http://127.0.0.1:8888/",data =word,headers={'Connection':'close'})
        print html.text
        xx = re.search("{ (.*?): '' }",html.text,re.S)#用到了正则去提取内容
        try:
            xx = xx.group(1)
            xx = xx.strip("'")
        except:
            continue
        tt[word] = xx
        print xx
        i = i+1
        s = requests.session()
        s.keep_alive = False
    ff.close()
    f.close()
  • 相关阅读:
    二叉搜索树的建树与遍历
    SpringBoot Mybatis 读写分离配置(山东数漫江湖)
    Spring注解概览(数漫江湖)
    Spring Cloud的基本认识和使用Spring Cloud的基本教程(山东数漫江湖)
    全面了解Nginx主要应用场景(数漫江湖)
    Spring boot集成RabbitMQ(山东数漫江湖)
    Spring 事务管理(山东数漫江湖)
    透彻理解Spring事务设计思想之手写实现(山东数漫江湖)
    Spring整合Quartz分布式调度(山东数漫江湖)
    Spring归纳小结(山东数漫江湖)
  • 原文地址:https://www.cnblogs.com/lovychen/p/5780549.html
Copyright © 2011-2022 走看看