zoukankan      html  css  js  c++  java
  • 爬虫—启新宝接口函数

    import re
    import time
    from urllib.parse import quote

    from bs4 import BeautifulSoup
    from selenium import webdriver


    # ************************************************************************
    def login(username, password):
        """Log in to qixin.com through the QQ OAuth form.

        username/password: QQ account credentials typed into the login iframe.
        Returns the authenticated selenium Firefox driver.
        """
        # NOTE(review): selenium's executable_path normally points at the
        # geckodriver binary, not firefox.exe itself -- confirm this path
        # actually works in the target environment.
        driver = webdriver.Firefox(executable_path='C:/Program Files (x86)/Mozilla Firefox/firefox.exe')
        driver.get('http://www.qixin.com/login')
        time.sleep(2)
        driver.maximize_window()
        driver.find_element_by_link_text('QQ登录').click()
        # The QQ credential form lives inside an iframe; switch into it first.
        # (switch_to_frame() is deprecated in favour of switch_to.frame().)
        driver.switch_to.frame('ptlogin_iframe')
        driver.find_element_by_link_text('帐号密码登录').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys(username)
        driver.find_element_by_id('p').send_keys(password)
        driver.find_element_by_class_name('login_button').click()
        # Give the post-login redirect time to settle before returning.
        time.sleep(5)
        return driver
    # ************************************************************************
    def search_ename(driver, i):
        """Load the qixin.com search-results page for enterprise name *i*.

        The name is percent-encoded with urllib.parse.quote so that Chinese
        company names form a valid query string (the original interpolated
        the raw text into the URL).
        Returns the driver, now positioned on the results page.
        """
        url = ('http://www.qixin.com/search?key=' + quote(i)
               + '&type=enterprise&method=all')
        print('正在抓取:' + url)
        driver.get(url)
        return driver
    # ************************************************************************
    def get_enameurl(driver):
        """Return the first company link found on the search-results page.

        Returns the absolute URL of the first '/company/gongsi...' anchor, or
        the literal marker string '不存在' when no company link is present.
        """
        # Pin the parser explicitly: a bare BeautifulSoup(markup) call picks
        # whichever parser happens to be installed and emits a warning.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        sfa = soup.find_all(href=re.compile('/company/gongsi'))
        if sfa:
            return 'http://www.qixin.com' + sfa[0]['href']
        return '不存在'
    # ************************************************************************
    def get_basic_info(soup):
        """Extract registration info, shareholders and key staff from a
        company-page soup.

        Output convention (shared by the other extractors): fields within one
        record are separated by half-width spaces, records by full-width
        (U+3000) spaces; missing values become '-'.
        """
        # --- registration ("工商基本信息") panel ------------------------------
        panels = soup.find_all(class_="panel panel-default basic-info")
        s = ''
        for span in panels[0].find_all('span')[1:]:
            s = s + ' ' + span.string

        # The legal-representative field is rendered as a link, so it cannot
        # be collected with the span loop above.
        links = soup.find_all(href=re.compile('search'))
        if len(links) != 0:
            faren = links[0].string
            # .string is None when the tag holds nested markup; fall back to
            # the '-' placeholder so the concatenation below cannot fail.
            if isinstance(faren, str):
                s = faren + s
            else:
                s = '-' + s
        else:
            # BUG FIX: the original read `s = fanren + s` -- a typo for
            # `faren` that raised NameError; the intent was the placeholder.
            s = '-' + s

        # Company name comes from the page <title>; it may be missing.
        try:
            qiyemingcheng = soup.title.string.split('|')[0]
        except Exception:
            qiyemingcheng = '-'
        s = qiyemingcheng + ' ' + s + '　'

        # --- shareholders and key staff panels --------------------------------
        panels = soup.find_all(class_='panel panel-default', style="margin-top:0")
        # Shareholders appear either as plain <span>s or as <a> links.
        gudonglist = panels[0].find_all('span', class_=None)
        for gudong in gudonglist:
            s = s + gudong.string + ' '
        gudonglist2 = panels[0].find_all('a')
        for gudong in gudonglist2:
            s = s + gudong.string + ' '
        if len(gudonglist) == 0 and len(gudonglist2) == 0:
            s = s + '-'
        s = s + '　'

        # Key staff ("主要人员").
        bosslist = panels[1].find_all(class_='enterprise-employee-name')
        if len(bosslist) != 0:
            for boss in bosslist:
                s = s + boss.string + ' '
        else:
            s = s + '-'
        s = s + '　'
        print('成功写入基本信息数据 ************** ')
        return s
    # ************************************************************************
    def _risk_section_text(soup, section_id):
        """Concatenate the text of every risk-list item under *section_id*.

        Half- and full-width spaces are stripped from each item's text (the
        full-width space is reserved as the record separator); items are
        joined with trailing half-width spaces. Returns '-' for an empty
        section.
        """
        items = soup.find(id=section_id).find_all(class_='risk-list-item')
        if len(items) == 0:
            return '-'
        s = ''
        for item in items:
            text = item.get_text().replace(' ', '').replace('　', '')
            s = s + text + ' '
        return s


    def get_risk_info(soup):
        """Extract the seven risk sections of a company page into one string.

        The original repeated the same extraction loop seven times (and left
        an unused find_all at the top); the per-section logic now lives in
        _risk_section_text. One full-width-space-terminated record is emitted
        per section, in the original order.
        """
        sections = ['changerecord', 'lawsuit', 'notice', 'executionperson',
                    'dishonest', 'sfpm', 'abnormal']
        s = ''
        for section_id in sections:
            s = s + _risk_section_text(soup, section_id) + '　'
        print('成功写入风险信息数据 ************** ')
        return s
    # ************************************************************************
    def get_investment_info(soup):
        """Collect the outbound-investment entries from a company page.

        Each search-result title is emitted with a trailing half-width space;
        an empty section yields the '-' placeholder. The record is terminated
        by a full-width space.
        """
        titles = soup.find_all(class_="search-result-title")
        if titles:
            s = ''.join(title.get_text() + ' ' for title in titles)
        else:
            s = '-'
        s += '　'
        print('成功写入对外投资数据 ************** ')
        return s
    # ************************************************************************
    def _texts_or_dash(tags):
        """Join the text of *tags* with trailing half-width spaces, writing
        '-' in place of any empty value."""
        s = ''
        for tag in tags:
            value = tag.get_text()
            if value == '':
                value = '-'
            s = s + value + ' '
        return s


    def get_report_info(soup):
        """Extract the annual-report basics and the asset/finance table.

        The original duplicated the same extract-or-dash loop for both
        panels; it now lives in _texts_or_dash. Two full-width-space
        terminated records are returned.
        """
        s = ''
        # Basic-info panel of the annual report.
        info_panel = soup.find(class_="panel-body report-info info-part")
        s = s + _texts_or_dash(info_panel.find_all('span')) + '　'
        # Company asset / finance table.
        finance_table = soup.find(class_="table table-bordered table-striped",
                                  style="margin-top:10px;")
        s = s + _texts_or_dash(finance_table.find_all('td')) + '　'
        print('成功写入企业年报数据 ************** ')
        return s
    # ************************************************************************
    def _scrape_tab(driver, anchor, extract, fallback):
        """Click the tab whose link targets *anchor*, re-parse the page and
        run *extract* on it; return *fallback* placeholders on any failure
        (tab missing, extraction error) so one broken section does not abort
        the whole company."""
        try:
            driver.find_element_by_xpath("//a[@href='%s']" % anchor).click()
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            return extract(soup)
        except Exception:
            return fallback


    def get_evalue(driver, eurl):
        """Fetch company page *eurl* and assemble the basic / risk /
        investment / annual-report fields into one string.

        Fixes over the original: the three bare `except:` clauses are
        narrowed to `except Exception`, the parser is pinned, and the
        triplicated click-sleep-parse pattern is factored into _scrape_tab.
        """
        print('正在抓取' + eurl)
        driver.get(eurl)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        s = get_basic_info(soup)
        # Placeholders mirror the record counts of each section extractor.
        s = s + _scrape_tab(driver, '#risk', get_risk_info, '-　-　-　-　-　-　-　')
        s = s + _scrape_tab(driver, '#investment', get_investment_info, '-　')
        s = s + _scrape_tab(driver, '#report', get_report_info, '-　-　')
        return s
    # ************************************************************************
    def get_enterprise_data(driver, ename):
        """Search for *ename*, scrape its company page and return the first
        record's fields as a list of strings.

        When no company link is found, the result records the page title plus
        the '该企业不存在' (company does not exist) marker instead.
        """
        # Run the search and pull the first company link from the results.
        driver = search_ename(driver, ename)
        eurl = get_enameurl(driver)
        if eurl != '不存在':
            data = get_evalue(driver, eurl)
        else:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                qiyemingcheng = soup.title.string.split('|')[0]
            except Exception:
                qiyemingcheng = '-'
            data = qiyemingcheng + ',' + '该企业不存在　'
        print('成功写入数据 ************** ')
        # Records are separated by full-width spaces, fields by half-width
        # spaces; only the first record's fields are returned.
        sdata = data.split('　')
        endata = sdata[0].split(' ')
        return endata
    # ************************************************************************
    if __name__ == '__main__':
        # Log in once; get_enterprise_data can then be called repeatedly with
        # the same driver. Guarded so importing this module has no side
        # effects.
        driver = login('QQ账号', 'QQ密码')
        data = get_enterprise_data(driver, '企业名称')


  • 相关阅读:
    LeetCode120 Triangle
    LeetCode119 Pascal's Triangle II
    LeetCode118 Pascal's Triangle
    LeetCode115 Distinct Subsequences
    LeetCode114 Flatten Binary Tree to Linked List
    LeetCode113 Path Sum II
    LeetCode112 Path Sum
    LeetCode111 Minimum Depth of Binary Tree
    Windows下搭建PHP开发环境-WEB服务器
    如何发布可用于azure的镜像文件
  • 原文地址:https://www.cnblogs.com/wyj690/p/5386984.html
Copyright © 2011-2022 走看看