zoukankan      html  css  js  c++  java
  • kaggle竞赛全收录app开发规划 标签: api / python / kaggle / 谷歌翻译 / 网页抓取 2017-04-29 09:31 218人阅读

    前言

    由于本人英语不算好, 又想离线查看所有比赛的中文版本, 不得已只能利用爬虫将其下载下来并整理到本地。

    开发过程

    • 1, 要能够离线进行google翻译
    • 2, 提供各个详情页重要数据的预览功能
    • 3, 利用解析网页工具得到md格式便于预览
    • 4, 存储数据到数据库
    • 5, 对每个竞赛中高票 (vote) 的 kernel / discussion 进行爬取记录

    目前完成阶段

    1, 利用google翻译API模拟浏览器进行翻译

    2, 竞赛主页的数据爬取

    以上内容花了我整整一天的时间; 写出来简单, 但是每个函数的衔接, bug调试等都要花非常多的精力; 特别是google翻译, 网上几乎没有成熟的版本。

    google翻译代码片

    难点介绍

    • 1, 通过抓包发现, 每次google翻译请求都会带上一个 tk 参数; 这个tk是动态的, 这里参考网上的版本, 利用 pyexecjs 包执行JS代码来计算。
    • 2, 解析返回结果; 注意译文是 result[4:end]
    • 3, open_url 这个比较简单。模拟浏览器也不算难点; 略
    # -*- coding: utf-8 -*-
    
    import execjs
    
    class GoogleTranslaterTk():
        """Compute the ``tk`` token that ``translate`` sends with each request.

        The token is produced by a JavaScript function ``TL`` that is compiled
        through ``execjs`` when the instance is created. ``TL`` returns a string
        made of two numbers joined by ``"."``.

        NOTE(review): presumably this mirrors the token routine used by the
        Google Translate web page (per the accompanying article) -- confirm
        against the live service before relying on it.
        """

        def __init__(self):
            """Compile the embedded JavaScript and keep the context on ``self.ctx``."""
            # The JS source below is handed to execjs verbatim. It defines
            # TL(a), the entry point called by getTk, and its helper RL(a, b).
            self.ctx = execjs.compile("""
            function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;
    
            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";
    
            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240,
                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128),
                e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f],
            a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };
    
        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2),
                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)
    
        def getTk(self, text):
            """Return the result of calling the compiled ``TL`` function on *text*."""
            return self.ctx.call("TL", text)
    
    import urllib.request
    
    def open_url(url):
        """Fetch *url* with a browser-like User-Agent and return the body as text.

        Args:
            url: Address to request.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the request cannot be completed.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Pretend to be a desktop browser instead of sending urllib's default
        # User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        # Python 2 equivalents: urllib2.Request / urllib2.urlopen.
        req = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read() or
        # decode() raises, instead of leaving it to garbage collection.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')
    
    def translate(content, tk):
        """Translate English *content* to Simplified Chinese via translate.google.cn.

        Args:
            content: Text to translate; at most 4891 characters.
            tk: Request token for *content*, as returned by
                ``GoogleTranslaterTk.getTk``.

        Returns:
            The slice of the raw response between offset 4 and the first
            ``",`` sequence, or ``None`` when *content* is too long or that
            sequence is not found past offset 4.
        """
        # urllib.request pulls in urllib.parse as a side effect; import it
        # explicitly so this function does not depend on that.
        from urllib.parse import quote

        if len(content) > 4891:
            # Over-long input is reported and skipped rather than raised;
            # callers must split the text themselves.
            print("翻译长度过长;请注意分割")
            return None

        content = quote(content)

        # Parenthesised so the adjacent literals are joined into a single
        # string before .format() fills in the token and the quoted text.
        url = ("http://translate.google.cn/translate_a/single?client=t"
               "&sl=en&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
               "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
               "&srcrom=0&ssel=0&tsel=0&kc=2&tk={}&q={}").format(tk, content)

        result = open_url(url)

        # The wanted text starts after the first four characters of the
        # response and runs up to the first '",' (closing quote + comma).
        end = result.find("\",")
        if end > 4:
            return result[4:end]
        return None
    
    def tranEn2Cn(content):
        """Translate English *content* to Chinese and return ``translate``'s result.

        A new ``GoogleTranslaterTk`` is created on every call to compute the
        request token for *content*.
        """
        token = GoogleTranslaterTk().getTk(content)
        return translate(content, token)
    

    test.py

    from translate_goole_sy import tranEn2Cn
    
    # Smoke check: translate one English sentence and print whatever comes back.
    print (tranEn2Cn("what are you want to do?!"))
    def test2(string):
        """Translate *string* line by line and write the results to a file.

        The output file is ``txt`` at the root of drive ``D:``. Blank lines
        are skipped, and so are lines for which ``tranEn2Cn`` returns
        ``None``. Translations are written back to back, with no separator
        added between them.

        Args:
            string: Newline-separated English text.
        """
        # UTF-8 is set explicitly because the output is Chinese text and the
        # platform default encoding may not be able to represent it.
        with open('d:\\txt', 'w', encoding='utf-8') as f:
            for line in string.split('\n'):
                if not line.strip():
                    continue
                translated = tranEn2Cn(line)
                # tranEn2Cn yields None for over-long or unparsable responses;
                # writing None would raise TypeError.
                if translated is not None:
                    f.write(translated)
    

    kaggle主页竞赛预览

    没什么难点; 主要是按格式解析内容; 花了挺长时间的
    - 非贪婪匹配内容; df 数据写入。
    - 注意下面那些 id 的内容和 columns 实际上是可以动态生成的;

    import urllib.request as ur
    
    def open_url(url):
        """Fetch *url* with a browser-like User-Agent and return the body as text.

        Args:
            url: Address to request.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the request cannot be completed.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        # Python 2 equivalents: urllib2.Request / urllib2.urlopen.
        req = ur.Request(url=url, headers=headers)
        # The context manager closes the connection even if read() or
        # decode() raises, instead of leaving it to garbage collection.
        with ur.urlopen(req) as response:
            return response.read().decode('utf-8')
    
    '''<a class="block-link__anchor" href="/c/"
    "intel-mobileodt-cervical-cancer-screening"></a>'''
    
    import pandas as pd
    import numpy as np
    import re
    import json
    
    def demo():
        """Scrape pages 1-14 of the Kaggle competition listing into a CSV file.

        Each page is fetched with ``open_url``. Every embedded competition
        record that matches the pattern built from ``columns`` is collected,
        printed with a running number, loaded into a DataFrame and written to
        ``de.csv`` on the author's desktop.

        Side effects:
            Sets the module-level ``n`` to the number of columns, prints the
            rows and the DataFrame, and writes the CSV file.
        """
        # Kept for backward compatibility: other code may read the global n.
        global n

        columns = [ "competitionId","competitionTitle","competitionDescription",
                     "competitionUrl","thumbnailImageUrl","deadline","totalTeams","totalKernels","rewardQuantity",
                    "rewardTypeName","organizationName","organizationUrl","hostSegment","isLimited","isPrivate",
                    "isInClass","userHasEntered","rewardDisplay"]
        n = len(columns)

        # Build '{"competitionId":(.*?),"competitionTitle":(.*?),...}' from
        # the column list so the two cannot drift apart. The pattern only
        # matches objects whose keys appear in exactly this order, and each
        # value is captured non-greedily.
        regex = '{' + ','.join('"%s":(.*?)' % name for name in columns) + '}'
        pattern = re.compile(regex)

        ls = []
        for i_pageNum in range(1, 15):
            url2 = ("https://www.kaggle.com/competitions?sortBy=deadline&group=all&page="
                    + str(i_pageNum) + "&segment=allCategories")

            data = open_url(url2)
            # findall yields one tuple of captured groups per record.
            ls.extend(list(x) for x in pattern.findall(data))

        for num, row in enumerate(ls, 1):
            print ("------", num,"--------")
            print (row)

        df = pd.DataFrame(ls, columns=columns)
        print (df)
        # Backslashes are escaped: a bare "\U" in a normal string literal is
        # an invalid unicode escape and stops the file from compiling.
        df.to_csv("C:\\Users\\actanble\\Desktop\\de.csv")
    
    # Run the scrape only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        demo()

    后记

    今天就回家去了, 后面3-4天都没时间弄这个了, 本来说一鼓作气, 两三天弄好的…

    实际上, 仔细一想; 官网的数据 排版和观看实际上都是特别方便; 这个数据取下来主要目的是能够利用数据进行一些快速查阅; 这个效用并不是很高。

  • 相关阅读:
    java并发5-volatile关键字解析
    java并发4-单例设计方法
    Java并发3-多线程面试题
    JAVA并发2
    JAVA并发
    2015第27周三Java内存模型
    同一时候使用windows和linux系统
    深入浅出Windows BATCH
    DrawText的使用
    redmine忘记username和password
  • 原文地址:https://www.cnblogs.com/actanble/p/7128676.html
Copyright © 2011-2022 走看看