# Selenium practice: scrape the Tiantian (Eastmoney) fund-ranking table
# coding: utf-8
from selenium import webdriver
from lxml import etree
import json
import time
class Tiantian_spider():
    """Scrape the Eastmoney ("Tiantian") fund-ranking table page by page.

    Drives a PhantomJS browser (Selenium 3 API), parses each rendered page
    with lxml, appends the rows to a local text file, and clicks through to
    the next page until the paginator reports the last page.
    """

    def __init__(self):
        # NOTE(review): PhantomJS support was removed in Selenium 4; this
        # requires selenium<4 plus a local phantomjs binary on PATH.
        self.driver = webdriver.PhantomJS()
        self.html = None        # raw page source of the page being parsed
        self.next_page = True   # False once the paginator shows the last page

    # 1. issue the request (advance to the next page)
    def parser_url(self):
        """Click the paginator's last <label> and refresh self.html."""
        if self.next_page:
            # Click the page control to advance one page.
            self.driver.find_element_by_xpath(
                "//div[@id ='pagebar']/label[last()]").click()
            time.sleep(4)  # new rows arrive asynchronously; give them time
            self.html = self.driver.page_source

    # 2. parse the rendered page
    def parser_data(self):
        """Parse self.html into a list of row dicts.

        Returns:
            tuple: (rows, next_page) where rows is a list of dicts keyed by
            the site's Chinese column headers, and next_page is the list of
            paginator <label> elements consumed by over_page().
        """
        rows = []
        html = etree.HTML(self.html)
        tr_list = html.xpath("//table[@id ='dbtable']//tbody/tr")
        next_page = html.xpath("//div[@id ='pagebar']//label[last()]")
        for tr in tr_list:
            tds = tr.xpath("./td")
            # Column positions are fixed by the site's table layout;
            # tds[0] (row checkbox) is skipped.
            dic = {
                "序号": tds[1].text,
                "基金代码": tds[2].xpath("./a/text()")[0],
                "基金简称": tds[3].xpath("./a/@title")[0],
                "日期": tds[4].text,
                "单位净增": tds[5].text,
                "累计净值": tds[6].text,
                "日增长率": tds[7].text,
                "近一周": tds[8].text,
                "近1月": tds[9].text,
                "近3月": tds[10].text,
                "近6月": tds[11].text,
            }
            print(dic["序号"])  # progress indicator
            rows.append(dic)
        return rows, next_page

    # 3. persist one page of rows
    def save_data(self, data):
        """Append one page of rows to 天天基金.txt as a JSON document.

        A trailing newline is written after each dump so successive pages
        can be split apart again (the original ran them together with no
        separator, making the file impossible to parse back).
        """
        with open("天天基金.txt", "a", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
            f.write("\n")  # FIX: delimit successive appended documents

    # paging controller
    def over_page(self, next_page):
        """Return True while more pages remain, False on the last page.

        Args:
            next_page: paginator <label> elements from parser_data().

        The last page is presumably flagged by an 'end' CSS class on the
        control — TODO confirm against the live DOM.
        """
        if not next_page:
            # FIX: no paginator found — was an unguarded IndexError;
            # treat it as "nothing left to crawl".
            return False
        kw = next_page[0].xpath("./label[contains(@class,'end')]")
        print(kw)  # debug trace of the end-marker lookup
        return len(kw) == 0

    def run(self, url):
        """Entry point: crawl every ranking page starting from *url*."""
        # 1. initial request
        self.driver.get(url)
        self.html = self.driver.page_source
        while self.next_page:
            # 2. parse the current page
            data, next_page = self.parser_data()
            # 3. persist it
            self.save_data(data)
            # 4. decide whether to keep paging, then advance
            self.next_page = self.over_page(next_page)
            self.parser_url()
        self.driver.quit()
if __name__ == "__main__":
    # Ranking page pre-filtered to the 2020-01-06 .. 2021-01-06 window.
    start_url = (
        "http://fund.eastmoney.com/data/fundranking.html#tall;c0;r;szzf;pn50;"
        "ddesc;qsd20200106;qed20210106;qdii;zq;gg;gzbd;gzfs;bbzt;sfbb"
    )
    spider = Tiantian_spider()
    spider.run(start_url)