zoukankan      html  css  js  c++  java
  • Python 爬基金数据

    使用 Python 2 爬取科学基金共享服务网(npd.nsfc.gov.cn)中的基金项目数据,并将解析结果存入 MongoDB。

    #coding=utf-8
    import json
    import requests
    from lxml import etree
    from HTMLParser import HTMLParser
    from pymongo import MongoClient
    
    # Form fields posted with every search request. All filter fields are left
    # empty; 'currentPage' selects the result page and 'pageSize' is sent as 10.
    data = {
        'pageSize': 10,
        'currentPage': 1,
        'fundingProject.projectNo': '',
        'fundingProject.name': '',
        'fundingProject.person': '',
        'fundingProject.org': '',
        'fundingProject.applyCode': '',
        'fundingProject.grantCode': '',
        'fundingProject.subGrantCode': '',
        'fundingProject.helpGrantCode': '',
        'fundingProject.keyword': '',
        'fundingProject.statYear': '',
        # NOTE(review): this value is already percent-encoded; the HTTP client
        # will presumably encode it a second time when building the form body
        # -- confirm the server does not care.
        'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
    }

    # Search endpoint that receives the POSTed form.
    url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

    # Request headers, apparently copied from a browser session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): fixed length taken from one captured request; it is
        # unlikely to match every body that gets sent -- confirm it is ignored.
        'Content-Length': '340',
        'Content-Type': 'application/x-www-form-urlencoded',
        # NOTE(review): hard-coded session id; presumably expires -- verify.
        'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
        'Host': 'npd.nsfc.gov.cn',
        'Origin': 'http://npd.nsfc.gov.cn',
        'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    
    def main():
        """Crawl every search-result page and store each project in MongoDB.

        Pages 1 through 43183 are requested one at a time from ``url``. Every
        ``<dl class="time_dl">`` element on a page is serialised back to HTML,
        has its character references unescaped, is parsed by ``jiexi`` and the
        resulting dict is inserted into ``ScienceFund.science_fund``.

        Raises:
            requests.exceptions.RequestException: if a page cannot be fetched
                (including when the request times out).
        """
        client = MongoClient('localhost', 27017)
        try:
            db = client.ScienceFund
            # Credentials are blank in this script; fill them in as needed.
            db.authenticate("","")
            collection = db.science_fund
            # One parser instance is enough; unescape() keeps no per-call state
            # that this loop relies on.
            unescape = HTMLParser().unescape
            for i in range(1, 43184):
                print(i)
                # Build a per-request payload instead of mutating the shared
                # module-level form dict.
                payload = dict(data, currentPage=i)
                # A timeout keeps one stalled connection from hanging the
                # whole crawl indefinitely.
                result = requests.post(url, data=payload, headers=headers,
                                       timeout=30)
                tree = etree.HTML(result.text)
                if tree is None:
                    # Nothing parseable came back for this page; move on.
                    continue
                for item in tree.xpath("//dl[@class='time_dl']"):
                    content = etree.tostring(item, method='html')
                    content = unescape(content)
                    collection.insert(jiexi(content))
        finally:
            # Always release the MongoDB connection, even if the crawl fails.
            client.close()
    
            
    def jiexi(content):
        """Parse one project entry's HTML into a dict of text fields.

        The title is taken as the text between the first ``">`` found at or
        after index 20 and the first ``</`` in ``content``. Each remaining
        field is located by its Chinese label; the value is the text after the
        label plus one separator character, up to the next ``</dd>``, with
        surrounding whitespace stripped. Labels are searched for in order, each
        search starting where the previous field ended.

        A field whose label is missing is returned as an empty string and does
        not disturb the fields that follow it.

        Args:
            content: HTML text of a single entry.

        Returns:
            dict with keys ``title``, ``standard_no``, ``standard_type``,
            ``supporting_institution``, ``project_principal``, ``funds``,
            ``year`` and ``keywords``.
        """
        # (output key, label as it appears in the page), in document order.
        fields = (
            ('standard_no', u'批准号'),               # approval number
            ('standard_type', u'项目类别'),            # project category
            ('supporting_institution', u'依托单位'),   # supporting institution
            ('project_principal', u'项目负责人'),      # principal investigator
            ('funds', u'资助经费'),                    # funding amount
            ('year', u'批准年度'),                     # approval year
            ('keywords', u'关键词'),                   # keywords
        )

        # Title: skip the first 20 characters so the opening tag of the entry
        # itself is not mistaken for the tag that precedes the title text.
        title_start = content.find('">', 20)
        title_end = content.find('</')
        if title_start == -1 or title_end == -1:
            title = u''
        else:
            title = content[title_start + 2:title_end]

        dc = {'title': title}
        # Position from which the next label is searched for.
        cursor = max(title_end, 0)
        for key, label in fields:
            label_pos = content.find(label, cursor)
            if label_pos == -1:
                # Missing field: leave the cursor alone so later labels can
                # still be found.
                dc[key] = u''
                continue
            # +1 skips the single separator character that follows the label.
            value_start = label_pos + len(label) + 1
            value_end = content.find('</dd>', label_pos)
            if value_end == -1:
                # No closing tag: take everything that is left.
                value_end = len(content)
            dc[key] = content[value_start:value_end].strip()
            cursor = value_end
        return dc
    
    # Start the crawl only when this file is executed as a script.
    if "__main__" == __name__:
        main()
  • 相关阅读:
    ECharts之柱状图 饼状图 折线图
    Vue自定义指令(directive)
    HDU 1231 最大连续子序列
    POJ 2533 Longest Ordered Subsequence
    HDU 1163 Eddy's digital Roots
    HDU 2317 Nasty Hacks
    HDU 2571 命运
    HDU 4224 Enumeration?
    HDU 1257 最少拦截系统
    HDU 2740 Root of the Problem
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/8482255.html
Copyright © 2011-2022 走看看