zoukankan      html  css  js  c++  java
  • 高企云爬虫

    下面是小编工作中爬取高企云数据的源代码,并增加了验证码识别的代码:

      1 # -*- coding:utf-8 -*-
      2 import requests
      3 import time,re
      4 import pytesseract
      5 import urllib
      6 import json
      7 from PIL import Image
      8 from selenium import webdriver
      9 from lxml import etree
     10 from connect_monogo import save #Python单利设计模式
     11 
     12 
     class Login(object):
         """Crawler for the hights.cn document library.

         A Selenium-driven Chrome instance performs the captcha-protected
         login, and a ``requests`` session (fed with the browser's cookies)
         queries the JSON list endpoints.

         NOTE(review): the default ``phone``/``password`` values are the
         credentials that were hard-coded in the original script. They are kept
         only for backward compatibility; pass your own and keep real
         credentials out of source control.
         """

         BASE_URL = 'http://www.hights.cn'
         LOGIN_URL = BASE_URL + '/beetl/login/toLogin.html'
         LIST_URL = BASE_URL + '/beetl/library/list.do?showCount=15&libtype=%d&currentPage=%d'
         DOWNLOAD_URL = (BASE_URL + '/beetl/mydocument/download?targetId=%s'
                         '&clazzName=com.fh.entity.system.Library&filePath=%s')
         # Accept both '<img ... />' and '<img ...>': the markup comes from
         # driver.page_source, which may not keep the self-closing form.
         CODE_IMG_RE = re.compile(r'<img id="codeImg" alt="" src="(.*?)"\s*/?>')
         REQUEST_TIMEOUT = 30  # seconds; requests never times out by default

         def __init__(self, phone='15766264244', password='123456789', headers=None):
             """Create the HTTP session and start Chrome.

             Args:
                 phone: account phone number typed into the login form.
                 password: account password typed into the login form.
                 headers: optional extra HTTP headers sent with every
                     ``requests`` call (the original code referenced an
                     undefined global ``headers``).
             """
             self.phone = phone
             self.password = password
             self.headers = dict(headers or {})
             self.logged_in = False
             self.s = requests.session()
             self.driver = webdriver.Chrome()

         def _sync_cookies(self):
             """Copy the browser's cookies into the requests session."""
             for cookie in self.driver.get_cookies():
                 self.s.cookies.set(cookie['name'], cookie['value'])

         def _get(self, url):
             """GET *url* through the shared session with the instance headers."""
             return self.s.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

         def _list_records(self, libtype, pages, verbose=False):
             """Return the concatenated ``data`` entries of every requested page."""
             records = []
             for page in pages:
                 url = self.LIST_URL % (libtype, page)
                 if verbose:
                     print(url)
                 # jiexi_data iterates 'data' as a sequence of dicts, so it is
                 # treated as a list here.
                 records.extend(json.loads(self._get(url).text)['data'])
             return records

         def _submit_login(self, captcha):
             """Fill in the login form and click the login button."""
             # find_element(by, value) works on old and new Selenium releases;
             # the find_element_by_* shortcuts were removed in Selenium 4.3.
             self.driver.find_element('name', 'phone').send_keys(self.phone)
             self.driver.find_element('name', 'password').send_keys(self.password)
             time.sleep(2)
             self.driver.find_element('name', 'code').send_keys(captcha)
             time.sleep(5)
             self.driver.find_element('id', 'login_btn').click()

         def get_code(self):
             """Open the login page, OCR the captcha and submit the login form.

             Returns:
                 The string '验证码为空号' when no captcha image is found in the
                 page source; otherwise None (also when the captcha download
                 fails, in which case no login is attempted).
             """
             self.driver.get(self.LOGIN_URL)
             code_url = self.CODE_IMG_RE.findall(self.driver.page_source)
             if not code_url:
                 return '验证码为空号'
             d_url = self.BASE_URL + code_url[0]
             print(d_url)

             # Download with the browser's cookies. Presumably the server ties
             # the captcha to the session cookie, so a cookie-less download would
             # yield a code that does not belong to the browser's session.
             self._sync_cookies()
             try:
                 resp = self._get(d_url)
                 resp.raise_for_status()
                 with open('code.jpg', 'wb') as fh:
                     fh.write(resp.content)
             except IOError:
                 # requests' exceptions derive from IOError as well.
                 print('验证码链接错误')
                 return None

             with Image.open('code.jpg') as im:
                 # OCR output may end with a newline/form feed; sending that to
                 # the input would act as pressing Enter.
                 hk = pytesseract.image_to_string(im).strip()

             self._submit_login(hk)
             # NOTE(review): the login result is not verified before printing.
             print('登录成功')
             self.logged_in = True
             time.sleep(55)
             # Pick up the cookies set by the login for later list requests.
             self._sync_cookies()
             return None

         def Public_list(self):
             """Return the raw response text of page 1 of the company directory."""
             return self._get(self.LIST_URL % (0, 1)).text

         def Template_model(self):
             """Return the template/sample documents from pages 1-2."""
             return self._list_records(2, range(1, 3))

         def Government_documents(self):
             """Return the government documents from pages 1-9."""
             return self._list_records(5, range(1, 10))

         def Policy_interpretation(self):
             """Return the policy interpretations from pages 1-14."""
             return self._list_records(4, range(1, 15), verbose=True)

         def Other_information(self):
             """Return the miscellaneous documents from pages 1-2."""
             return self._list_records(3, range(1, 3))

         def jiexi_data(self):
             """Download every PDF listed by ``Other_information`` in the browser.

             Logs in first through ``get_code`` when that has not happened yet;
             returns without downloading if the login could not be performed.
             """
             if not self.logged_in:
                 self.get_code()
             if not self.logged_in:
                 return None

             pdf_list = [
                 self.DOWNLOAD_URL % (item['id'], item['pdfpath'])
                 for item in self.Other_information()
             ]
             print(pdf_list)
             for pdf_url in pdf_list:
                 self.driver.get(pdf_url)
                 print('下载完成。。。。')
             # Leave the browser open long enough for the downloads to finish.
             time.sleep(400)
             return None
    114 
    def main():
        """Log in to hights.cn once, then shut the browser down.

        The browser is always quit, even when the login step raises, so no
        Chrome/chromedriver process is left behind.
        """
        r = Login()
        try:
            r.get_code()
        finally:
            r.driver.quit()
    127 
    # Run the crawler only when this file is executed as a script.
    if '__main__' == __name__:
        main()
  • 相关阅读:
    【Nowcoder71E】组一组(差分约束,最短路)
    【CodeChef】Querying on a Grid(分治,最短路)
    【BZOJ4061】[Cerc2012]Farm and factory(最短路,构造)
    【AtCoder3611】Tree MST(点分治,最小生成树)
    【AtCoder2134】ZigZag MST(最小生成树)
    【CF891C】Envy(最小生成树)
    【BZOJ5339】[TJOI2018]教科书般的亵渎(斯特林数)
    【BZOJ5337】[TJOI2018]str(动态规划,哈希)
    【BZOJ5336】[TJOI2018]party(动态规划)
    【BZOJ5335】[TJOI2018]智力竞赛(二分图匹配)
  • 原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/8378252.html
Copyright © 2011-2022 走看看