zoukankan      html  css  js  c++  java
  • 第一个爬虫

    import pymongo
    import requests
    import re
    import time
    from pyquery import PyQuery as pq
    
    #抓取页面Html
    def GetHtml(url,page):
        """Fetch one page of the project list grid and return its HTML text.

        Sleeps for 3 seconds, then POSTs an ASP.NET form to ``url`` with the
        pager drop-down as ``__EVENTTARGET``, a previously captured
        ``__VIEWSTATE`` blob, and ``page`` as the drop-down value.

        Args:
            url: Address of the list page to post to.
            page: Value sent for the "jump to page" drop-down. The progress
                message prints ``int(page) + 1``, so this is presumably a
                zero-based page index -- TODO confirm against the site.

        Returns:
            The decoded response body (``response.text``), regardless of the
            HTTP status code.

        Raises:
            Exception: whatever ``requests.post`` raised. The error is printed
                and re-raised instead of falling through to an unbound
                ``response`` variable.
        """
        # Fixed pause before every request (throttling).
        time.sleep(3)
        print("当前页%d"%(int(page)+1))
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        fromContent = {
            '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridViewData$ctl13$ddlJumpToPageNum',
            # Captured view-state blob. It is written as adjacent string
            # literals so that it stays a single value while remaining valid
            # Python (a quoted string cannot span physical lines).
            '__VIEWSTATE': (
                '/wEPDwULLTE1ODA5ODgxODgPZBYCZg9kFgICAw9kFgICAQ9kFgYCAQ8QDxYGHg1EYXRhVGV4dEZpZWxkBQROYW1lHg5EYXRhVmFsdWVGaWVsZAUFWUNvZGUeC18hRGF0YUJvdW5kZ2QQFQQIPeWFqOmDqD0M6YOo5bGe6auY5qChDOacrOenkemrmOagoQzpq5jogYzpq5jkuJMVBAAEMDEwMgQwMTAzBDAxMDQUKwMEZ2dnZ2RkAgUPEA8WBh8ABQRuYW1lHwEFBXhjb2RlHwJnZBAVBAg95YWo6YOoPQbmqKrlkJEG57q15ZCRBuagoeWGhRUEAAgwMjEzMDEwMQgwMjEzMDEwMggwMjEzMDEwMxQrAwRnZ2dnZGQCCw88KwANAQAPFgQfAmceC18hSXRlbUNvdW50AtYLZBYCZg9kFhgCAQ9kFhBmD2QWAmYPFQIDMTQyCDIxMTAxMDUwZAIBD2QWAmYPFQJa5rCu5p2C546v5Y2h5a6+6ZOx44CB6ZOR5aSn546v5YyW5ZCI54mp55qE5ZCI5oiQ44CB57uT5p6E5Y+K5YKs5YyW56GF5rCi5Yqg5oiQ5Y+N5bqU56CU56m2WuawruadgueOr+WNoeWuvumTseOAgemTkeWkp+eOr+WMluWQiOeJqeeahOWQiOaIkOOAgee7k+aehOWPiuWCrOWMluehheawouWKoOaIkOWPjeW6lOeglOeptmQCAg9kFgJmDxUCBue6teWQkQbnurXlkJFkAgMPZBYCZg8VAhvlm73lrrboh6rnhLbnp5Hlrabln7rph5Hlp5Qb5Zu95a626Ieq54S256eR5a2m5Z+66YeR5aeUZAIED2QWAmYPFQIn5pyJ5py656GF5YyW5a2m5Y+K5p2Q5paZ5oqA5pyv5a6e6aqM5a6kJ+acieacuuehheWMluWtpuWPiuadkOaWmeaKgOacr+WunumqjOWupGQCBQ9kFgJmDxUCCjIwMTEtMDgtMjUKMjAxMS0wOC0yNWQCBg9kFgJmDxUCCTI1MDAwMC4wMAkyNTAwMDAuMDBkAgcPZBYCZg8VAgnmlYjml63nkLwJ5pWI5pet55C8ZAICD2QWEGYPZBYCZg8VAgMxNDMIMjAxM1owMzhkAgEPZBYCZg8VAi3ojbfovb3kvZznlKjkuIvmt7flh53lnJ/nmoTmuJfpgI/mgKfog73noJTnqbYt6I236L295L2c55So5LiL5re35Yed5Zyf55qE5riX6YCP5oCn6IO956CU56m2ZAICD2QWAmYPFQIG57q15ZCRBue6teWQkWQCAw9kFgJmDxUCEua1meaxn+ecgeW7uuiuvuWOhRLmtZnmsZ/nnIHlu7rorr7ljoVkAgQPZBYCZg8VAgnkv6Hmga/ns7sJ5L+h5oGv57O7ZAIFD2QWAmYPFQIKMjAxMy0wOS0wMQoyMDEzLTA5LTAxZAIGD2QWAmYPFQIHNDAwMC4wMAc0MDAwLjAwZAIHD2QWAmYPFQIJ56ul5oWn6IqdCeerpeaFp+iKnWQCAw9kFhBmD2QWAmYPFQIDMTQ0CDIxMjAyMDMxZAIBD2QWAmYPFQI1UHJpbnPmiJDnjq/mlrDnrZbnlaXlnKjlpKnnhLbkuqfnianlkIjmiJDkuK3nmoTlupTnlKg1UHJpbnPmiJDnjq/mlrDnrZbnlaXlnKjlpKnnhLbkuqfnianlkIjmiJDkuK3nmoTlupTnlKhkAgIPZBYCZg8VAgbnurXlkJEG57q15ZCRZAIDD2QWAmYPFQIb5Zu95a626Ieq54S256eR5a2m5Z+66YeR5aeUG+WbveWutuiHqueEtuenkeWtpuWfuumHkeWnlGQCBA9kFgJmDxUCJ+acieacuuehheWMluWtpuWPiuadkOaWmeaKgOacr+WunumqjOWupCfmnInmnLrnoYXljJblrablj4rmnZDmlpnmioDmnK/lrp7pqozl'
                'rqRkAgUPZBYCZg8VAgoyMDEyLTA4LTE3CjIwMTItMDgtMTdkAgYPZBYCZg8VAgkyNTAwMDAuMDAJMjUwMDAwLjAwZAIHD2QWAmYPFQIJ6ZmI5beN5bOwCemZiOW3jeWzsGQCBA9kFhBmD2QWAmYPFQIDMTQ1CjIwMDhDMTQwNDFkAgEPZBYCZg8VAjnlpJrmmbbnoYXkuqfkuJrlia/kuqfnianlm5vmsK/ljJbnoYXnmoTnu7zlkIjliKnnlKjnoJTnqbY55aSa5pm256GF5Lqn5Lia5Ymv5Lqn54mp5Zub5rCv5YyW56GF55qE57u85ZCI5Yip55So56CU56m2ZAICD2QWAmYPFQIG57q15ZCRBue6teWQkWQCAw9kFgJmDxUCEua1meaxn+ecgeenkeaKgOWOhRLmtZnmsZ/nnIHnp5HmioDljoVkAgQPZBYCZg8VAifmnInmnLrnoYXljJblrablj4rmnZDmlpnmioDmnK/lrp7pqozlrqQn5pyJ5py656GF5YyW5a2m5Y+K5p2Q5paZ5oqA5pyv5a6e6aqM5a6kZAIFD2QWAmYPFQIKMjAwOC0wNy0wNgoyMDA4LTA3LTA2ZAIGD2QWAmYPFQIKMTAwMDAwMC4wMAoxMDAwMDAwLjAwZAIHD2QWAmYPFQIJ5b2t5a625bu6CeW9reWutuW7umQCBQ9kFhBmD2QWAmYPFQIDMTQ2DDIwMTMwNTMzQjExIGQCAQ9kFgJmDxUCUeeUn+eJqeaTjee6teaKgOacr+WcqOmdkuWxsea5luawtOW6k+WvjOiQpeWFu+WMluawtOS9k+S/ruWkjeS4reeahOeglOeptuS4juW6lOeUqFHnlJ/nianmk43nurXmioDmnK/lnKjpnZLlsbHmuZbmsLTlupPlr4zokKXlhbvljJbmsLTkvZPkv67lpI3kuK3nmoTnoJTnqbbkuI7lupTnlKhkAgIPZBYCZg8VAgbnurXlkJEG57q15ZCRZAIDD2QWAmYPFQIS5p2t5bee5biC56eR5oqA5bGAEuadreW3nuW4guenkeaKgOWxgGQCBA9kFgJmDxUCG+eUn+WRveS4jueOr+Wig+enkeWtpuWtpumZohvnlJ/lkb3kuI7njq/looPnp5HlrablrabpmaJkAgUPZBYCZg8VAgoyMDEzLTExLTA0CjIwMTMtMTEtMDRkAgYPZBYCZg8VAgkxMDAwMDAuMDAJMTAwMDAwLjAwZAIHD2QWAmYPFQIG5aec5Li5BuWnnOS4uWQCBg9kFhBmD2QWAmYPFQIDMTQ3CDMxMTAwNTgzZAIBD2QWAmYPFQJL5LiA56eN5paw5Z6L6ICQ54Ot6Z2e54m55byC5oCn5qC46YW46YW255qE5L2c55So5py65Yi25Y+K5Yqf6IO96L+b5YyW56CU56m2S+S4gOenjeaWsOWei+iAkOeDremdnueJueW8guaAp+aguOmFuOmFtueahOS9nOeUqOacuuWItuWPiuWKn+iDvei/m+WMlueglOeptmQCAg9kFgJmDxUCBue6teWQkQbnurXlkJFkAgMPZBYCZg8VAg/lm73lrrbln7rph5Hlp5QP5Zu95a625Z+66YeR5aeUZAIED2QWAmYPFQIb55Sf5ZG95LiO546v5aKD56eR5a2m5a2m6ZmiG+eUn+WRveS4jueOr+Wig+enkeWtpuWtpumZomQCBQ9kFgJmDxUCCjIwMTItMDEtMDEKMjAxMi0wMS0wMWQCBg9kFgJmDxUCCTIzMDAwMC4wMAkyMzAwMDAuMDBkAgcPZBYCZg8VAgnnn7PpmYblqKUJ55+z6ZmG5ailZAIHD2QWEGYPZBYCZg8VAgMxNTALTFExM0gzMTAwMDRkAgEPZBYCZg8VAmDoh6rlmazlnKjph5HoibLphbDog7rphofpha/or7Hlr7zkurrnpZ7nu4/og7botKjnmKTnu4bog57mrbvkuqHov4fnqIvkuK3nmoTkvZzn'
                'lKjlj4rosIPmjqfmnLrliLZg6Ieq5Zms5Zyo6YeR6Imy6YWw6IO66YaH6YWv6K+x5a+85Lq656We57uP6IO26LSo55ik57uG6IOe5q275Lqh6L+H56iL5Lit55qE5L2c55So5Y+K6LCD5o6n5py65Yi2ZAICD2QWAmYPFQIG57q15ZCRBue6teWQkWQCAw9kFgJmDxUCJOa1meaxn+ecgeiHqueEtuenkeWtpuWfuumHkeWnlOWRmOS8miTmtZnmsZ/nnIHoh6rnhLbnp5Hlrabln7rph5Hlp5TlkZjkvJpkAgQPZBYCZg8VAhvljLvlrabpg6ggLSDln7rnoYDljLvlrabpg6gb5Yy75a2m6YOoIC0g5Z+656GA5Yy75a2m6YOoZAIFD2QWAmYPFQIKMjAxMy0wNS0wNgoyMDEzLTA1LTA2ZAIGD2QWAmYPFQIINTAwMDAuMDAINTAwMDAuMDBkAgcPZBYCZg8VAgbmnajmgKEG5p2o5oChZAIID2QWEGYPZBYCZg8VAgMxNTEIODEwNzE2MzZkAgEPZBYCZg8VAj9MZXQtN+WcqOWNoeazouawj+iCieeYpOeXheavkuWkjeWItuS4reeahOS9nOeUqOWPiuacuuWItueglOeptiA/TGV0LTflnKjljaHms6LmsI/ogonnmKTnl4Xmr5LlpI3liLbkuK3nmoTkvZznlKjlj4rmnLrliLbnoJTnqbYgZAICD2QWAmYPFQIG57q15ZCRBue6teWQkWQCAw9kFgJmDxUCIuWbveWutnroh6rnhLbnp5Hlrabln7rph5Hlp5TlkZjkvJoi5Zu95a62euiHqueEtuenkeWtpuWfuumHkeWnlOWRmOS8mmQCBA9kFgJmDxUCCeWMu+WtpumDqAnljLvlrabpg6hkAgUPZBYCZg8VAgoyMDEwLTA5LTAxCjIwMTAtMDktMDFkAgYPZBYCZg8VAgkzNTAwMDAuMDAJMzUwMDAwLjAwZAIHD2QWAmYPFQIH5p2o56OKIAfmnajno4ogZAIJD2QWEGYPZBYCZg8VAgMxNTIZ5rWZ6LSi5bu644CUMjAxM+OAlTM1N+WPt2QCAQ9kFgJmDxUCQuWkp+Wei+mSouWCqOe9kOWGhemDqOeIhueCuOWGsuWHu+iNt+i9veaVsOWAvOaooeaLn+WPiuegtOWdj+WIhuaekELlpKflnovpkqLlgqjnvZDlhoXpg6jniIbngrjlhrLlh7vojbfovb3mlbDlgLzmqKHmi5/lj4rnoLTlnY/liIbmnpBkAgIPZBYCZg8VAgbnurXlkJEG57q15ZCRZAIDD2QWAmYPFQIh5rWZ5rGf55yB5L2P5oi/5ZKM5Z+O5Lmh5bu66K6+5Y6FIea1meaxn+ecgeS9j+aIv+WSjOWfjuS5oeW7uuiuvuWOhWQCBA9kFgJmDxUCCeW7uuetkeezuwnlu7rnrZHns7tkAgUPZBYCZg8VAgoyMDEzLTA5LTI0CjIwMTMtMDktMjRkAgYPZBYCZg8VAgg1MDAwMC4wMAg1MDAwMC4wMGQCBw9kFgJmDxUCCeW6nuW0h+WuiQnlup7ltIflrolkAgoPZBYQZg9kFgJmDxUCAzE1NAgxMTIyNjI5OGQCAQ9kFgJmDxUCOOi3nemHj+enr+WIhuS4jkNhcmxpdHrlnotxLUhhcmR54oCTSGlsbGXlhazlvI/nmoTnoJTnqbYgOOi3nemHj+enr+WIhuS4jkNhcmxpdHrlnotxLUhhcmR54oCTSGlsbGXlhazlvI/nmoTnoJTnqbYgZAICD2QWAmYPFQIG57q15ZCRBue6teWQkWQCAw9kFgJmDxUCG+WbveWutuiHqueEtuenkeWtpuWfuumHkeWnlBvlm73lrrboh6rnhLbnp5Hlrabln7rph5Hlp5RkAgQPZBYCZg8VAgnnkIblrabpmaIJ55CG5a2m6ZmiZAIFD2QWAmYPFQIKMjAxMi0xMS0xNgoyMDEyLTEx'
                'LTE2ZAIGD2QWAmYPFQIIMzAwMDAuMDAIMzAwMDAuMDBkAgcPZBYCZg8VAgbmm7nlgaUG5pu55YGlZAILDw8WAh4HVmlzaWJsZWhkZAIMD2QWAmYPZBYCZg9kFgJmD2QWBGYPFgIeCWlubmVyaHRtbAUy5YWxIDE0OTQg5p2h6K6w5b2V77yM5b2T5YmN56ysIDEg6aG177yM5YWxIDE1MCDpobVkAgEPZBYCAgUPEA8WAh8CZ2QQFZYBATEBMgEzATQBNQE2ATcBOAE5AjEwAjExAjEyAjEzAjE0AjE1AjE2AjE3AjE4AjE5AjIwAjIxAjIyAjIzAjI0AjI1AjI2AjI3AjI4AjI5AjMwAjMxAjMyAjMzAjM0AjM1AjM2AjM3AjM4AjM5AjQwAjQxAjQyAjQzAjQ0AjQ1AjQ2AjQ3AjQ4AjQ5AjUwAjUxAjUyAjUzAjU0AjU1AjU2AjU3AjU4AjU5AjYwAjYxAjYyAjYzAjY0AjY1AjY2AjY3AjY4AjY5AjcwAjcxAjcyAjczAjc0Ajc1Ajc2Ajc3Ajc4Ajc5AjgwAjgxAjgyAjgzAjg0Ajg1Ajg2Ajg3Ajg4Ajg5AjkwAjkxAjkyAjkzAjk0Ajk1Ajk2Ajk3Ajk4Ajk5AzEwMAMxMDEDMTAyAzEwMwMxMDQDMTA1AzEwNgMxMDcDMTA4AzEwOQMxMTADMTExAzExMgMxMTMDMTE0AzExNQMxMTYDMTE3AzExOAMxMTkDMTIwAzEyMQMxMjIDMTIzAzEyNAMxMjUDMTI2AzEyNwMxMjgDMTI5AzEzMAMxMzEDMTMyAzEzMwMxMzQDMTM1AzEzNgMxMzcDMTM4AzEzOQMxNDADMTQxAzE0MgMxNDMDMTQ0AzE0NQMxNDYDMTQ3AzE0OAMxNDkDMTUwFZYBATABMQEyATMBNAE1ATYBNwE4ATkCMTACMTECMTICMTMCMTQCMTUCMTYCMTcCMTgCMTkCMjACMjECMjICMjMCMjQCMjUCMjYCMjcCMjgCMjkCMzACMzECMzICMzMCMzQCMzUCMzYCMzcCMzgCMzkCNDACNDECNDICNDMCNDQCNDUCNDYCNDcCNDgCNDkCNTACNTECNTICNTMCNTQCNTUCNTYCNTcCNTgCNTkCNjACNjECNjICNjMCNjQCNjUCNjYCNjcCNjgCNjkCNzACNzECNzICNzMCNzQCNzUCNzYCNzcCNzgCNzkCODACODECODICODMCODQCODUCODYCODcCODgCODkCOTACOTECOTICOTMCOTQCOTUCOTYCOTcCOTgCOTkDMTAwAzEwMQMxMDIDMTAzAzEwNAMxMDUDMTA2AzEwNwMxMDgDMTA5AzExMAMxMTEDMTEyAzExMwMxMTQDMTE1AzExNgMxMTcDMTE4AzExOQMxMjADMTIxAzEyMgMxMjMDMTI0AzEyNQMxMjYDMTI3AzEyOAMxMjkDMTMwAzEzMQMxMzIDMTMzAzEzNAMxMzUDMTM2AzEzNwMxMzgDMTM5AzE0MAMxNDEDMTQyAzE0MwMxNDQDMTQ1AzE0NgMxNDcDMTQ4AzE0ORQrA5YBZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnFgFmZBgCBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUjY3RsMDAkQ29udGVudFBsYWNlSG9sZGVyMSRidG5TZWFyY2gFJmN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkR3JpZFZpZXdEYXRhDzwrAAoBCAKWAWThRBCCQv705jTgfanMwoOAORKRLg=='
            ),
            'ctl00$ContentPlaceHolder1$GridViewData$ctl13$ddlJumpToPageNum': page
        }
        try:
            response = requests.post(url=url, data=fromContent,headers=headers, timeout=10)
        except Exception as e:
            # Report and propagate: there is no response object to return.
            print('获取源码失败:%s'%e)
            raise
        # Use the encoding detected from the body when decoding the text.
        response.encoding=response.apparent_encoding
        if response.status_code==200:
            print("获取源码成功")
        else:
            # The body is still returned (as before); just make the
            # unexpected status visible in the log.
            print('获取源码失败:状态码%d'%response.status_code)
        return response.text
    
    def Get_ZIHtml(url):
        """Fetch a detail (sub) page with a GET request and return its HTML text.

        Sleeps for 2 seconds before the request.

        Args:
            url: Absolute address of the detail page.

        Returns:
            The decoded response body (``response.text``).

        Raises:
            Exception: whatever ``requests.get`` raised. The error is printed
                and re-raised instead of falling through to an unbound
                ``response`` variable.
        """
        # Fixed pause before every request (throttling).
        time.sleep(2)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        try:
            # Same 10 s timeout as GetHtml; without one a stalled server
            # would block the crawl indefinitely.
            response=requests.get(url=url,headers=headers, timeout=10)
        except Exception as e:
            print('获取子页面失败%s'%e)
            raise
        # Use the encoding detected from the body when decoding the text.
        response.encoding = response.apparent_encoding
        print('获取子页面成功')
        return  response.text
    
    #获取页面总页数
    def Get_Pagenumber(url,page):
        """Read the total number of pages from the list page's pager row.

        Fetches the list page via ``GetHtml(url, page)``, takes the text of the
        pager cells and extracts what sits between "共" and "页" after a comma.

        Args:
            url: Address of the list page.
            page: Page value forwarded to ``GetHtml``; also the fallback
                return value.

        Returns:
            The page count as a string with surrounding whitespace removed, or
            the ``page`` argument unchanged when fetching or parsing fails
            (the failure is printed, not raised).
        """
        try:
            html = GetHtml(url, page)
            doc = pq(html)
            pageItm = doc('#ctl00_ContentPlaceHolder1_GridViewData .td0 tr td')
            # Accept both the ASCII comma and the full-width comma (U+FF0C):
            # the pager text embedded in the captured view state uses the
            # full-width form, e.g. "...页，共 150 页".
            found = re.findall(r"[,，]共(.+?)页", pageItm.text())
            total = found[0].strip()
            print('获取页数成功')
            return total
        except Exception as e:
            print('获取页数失败%s'%e)
        # Best-effort fallback kept from the original behaviour.
        return page
    
    #mongoDB连接(保存主页数据)
    def Con_MongoDB():
        """Connect to the local MongoDB server and return the ``ZJGX`` database.

        Returns:
            The ``ZJGX`` database handle of a client bound to localhost:27017.

        Raises:
            Exception: whatever pymongo raised while creating the client or
                pinging the server. The error is printed and re-raised instead
                of falling through to an unbound ``db`` variable.
        """
        try:
            client = pymongo.MongoClient("localhost", 27017)
            # MongoClient connects lazily, so creating it proves nothing.
            # A cheap ping forces a round trip so that an unreachable server
            # is reported here rather than at the first insert.
            client.admin.command('ping')
            db = client.ZJGX
        except Exception as e:
            print('MongoDB连接失败:%s'%e)
            raise
        print('MongoDB连接成功')
        return db
    
    #获取Html遍历所有数据
    def Get_Data(page,db,url):
        """Crawl every list page plus each row's detail page and store it in MongoDB.

        For each page index ``0 .. int(page) - 1`` the list page is fetched
        with ``GetHtml``. For every row of the list grid the linked detail page
        is fetched with ``Get_ZIHtml`` and its sections are written to their
        own collections, each document tagged with ``FId`` (the text of the
        row's first cell). The list rows themselves are written to ``db.SY``
        once per page.

        Args:
            page: Number of list pages to crawl (anything ``int()`` accepts).
            db: Database handle whose collections are accessed as attributes.
            url: Address of the list page.

        Raises:
            IndexError: if a grid has rows but fewer header cells than the
                number of columns read from it.
            Exception: anything raised by the fetch helpers or by pymongo.
        """
        collection = db.SY  # list ("home") page rows
        collection1 = db.LXXX  # project approval information
        collection2 = db.FZR  # project leader and team members
        collection3 = db.JFYS  # budget and its execution
        collection4 = db.JFLJ  # cumulative income and expenditure
        collection5 = db.JFZC  # expenditure details
        collection6 = db.KYCG  # research results
        collection7 = db.DESP  # large equipment / material names and prices
        collection8 = db.XMYS  # acceptance of completed projects

        # (grid selector, target collection, number of columns read per row).
        # The split keeps the original insert order: these three grids, then
        # the expenditure tabs, then the remaining two grids.
        grids_before_expenditure = (
            ('#ctl00_ContentPlaceHolder1_GridViewData1', collection2, 5),
            ('#ctl00_ContentPlaceHolder1_GridViewData9', collection3, 6),
            ('#ctl00_ContentPlaceHolder1_GridViewData2', collection4, 6),
        )
        grids_after_expenditure = (
            ('#ctl00_ContentPlaceHolder1_GridViewData4', collection6, 5),
            ('#ctl00_ContentPlaceHolder1_GridViewData7', collection7, 8),
        )

        for i in range(int(page)):
            html = GetHtml(url, i)
            doc = pq(html)
            print(i)
            # Column titles of the main project table.
            headerItem = doc('#ctl00_ContentPlaceHolder1_GridViewData .td_bthead')
            headerList = [th.text() for th in headerItem('th').items()]
            bodyItem = doc('#ctl00_ContentPlaceHolder1_GridViewData .hashover')
            resultList = []
            for x in bodyItem('tr'):
                row = pq(x)
                cells = row.find('td')
                href = row.find('td a').attr('href')
                # The first cell's text links every sub-document to its row.
                fid = cells.eq(0).text()

                # Detail page of this row.
                ZIurl = 'http://kyjf.zjedu.gov.cn/kyjfgk/%s' % href
                ZIhtml = Get_ZIHtml(ZIurl)
                ZIdoc = pq(ZIhtml,parser="html")

                # Project approval information: alternating label/value cells.
                LXXX = ZIdoc('.tablelist1').children().children().children()
                collection1.insert_one(_label_value_doc(LXXX, 12, fid))

                for selector, target, width in grids_before_expenditure:
                    _store_grid_rows(ZIdoc, selector, _grid_headers(ZIdoc, selector), width, target, fid)

                # Expenditure: one grid per tab title. All grids reuse the
                # header row of the first grid (UcJf1).
                JFZCTitle = ZIdoc('ul[class="listtab"] li')
                JFZCheader = _grid_headers(ZIdoc, '#ctl00_ContentPlaceHolder1_UcJf1_GridViewData')
                for tab_no, tab in enumerate(JFZCTitle.items(), start=1):
                    _store_grid_rows(
                        ZIdoc,
                        '#ctl00_ContentPlaceHolder1_UcJf%d_GridViewData' % tab_no,
                        JFZCheader,
                        3,
                        collection5,
                        fid,
                        leading={'Title': tab.text()},
                    )

                for selector, target, width in grids_after_expenditure:
                    _store_grid_rows(ZIdoc, selector, _grid_headers(ZIdoc, selector), width, target, fid)

                # Acceptance of completed projects: label/value cells again.
                JTXM = ZIdoc('#ctl00_ContentPlaceHolder1_div8 .tablelist1 td')
                collection8.insert_one(_label_value_doc(JTXM, 6, fid))

                # List-page data for this row.
                result = {
                    "page": i+1,
                    'url':"http://kyjf.zjedu.gov.cn/kyjfgk/"+href,
                }
                for k in range(8):
                    result[headerList[k]] = cells.eq(k).text()
                resultList.append(result)
            # insert_many rejects an empty list, so skip pages without rows.
            if resultList:
                collection.insert_many(resultList)
            print("---第%d页完成---" % (int(i)+1))
        print('----数据获取成功!----')

    def _grid_headers(doc, grid_selector):
        """Return the texts of the header cells (children of ``.td_bthead``) of a grid."""
        return [cell.text() for cell in doc(grid_selector + ' .td_bthead').children().items()]

    def _store_grid_rows(doc, grid_selector, headers, column_count, collection, fid, leading=None):
        """Insert one document per ``.hashover`` row of a grid into ``collection``.

        Each document holds the optional ``leading`` fields first, then
        ``headers[k] -> text of child cell k`` for the first ``column_count``
        columns, then ``FId``.
        """
        for tr in doc(grid_selector + ' .hashover'):
            tds = pq(tr).children()
            item = dict(leading) if leading else {}
            for k in range(column_count):
                item[headers[k]] = tds.eq(k).text()
            item['FId'] = fid
            collection.insert_one(item)

    def _label_value_doc(cells, pair_count, fid):
        """Build a document from alternating label/value cells (label at 2k, value at 2k+1) plus ``FId``."""
        document = {}
        for k in range(pair_count):
            document[cells.eq(2 * k).text()] = cells.eq(2 * k + 1).text()
        document['FId'] = fid
        return document
    
    if __name__ == '__main__':
        # Script entry point: look up the page count, connect to MongoDB,
        # then crawl every page. Any failure is printed rather than raised.
        url = "http://kyjf.zjedu.gov.cn/kyjfgk/Item_Xm_query.aspx"
        try:
            page = Get_Pagenumber(url, 1)
            print(page)
            db = Con_MongoDB()
            Get_Data(page, db, url)
            print('获取完成')
        except Exception as e:
            print('数据获取失败%s' % e)
  • 相关阅读:
    sitemesh包装工具
    关于对XML的处理
    关于打开tomcat的远程调试功能
    hdu4531 乾坤大挪移
    hdu4521 小明序列 (线段树 + DP)
    hdu4527 && hdu4528
    zoj3691 Flower
    pku2817 WordStack
    zoj3652 Maze
    zoj3381 Osaisen Choudai!
  • 原文地址:https://www.cnblogs.com/zldqpm/p/10789194.html
Copyright © 2011-2022 走看看