  • 19. Exercise: scraping part of the Tiantian Fund (天天基金) data with Selenium and PhantomJS

    A Selenium practice exercise. The full script follows; a short note on adapting it to a current WebDriver comes after the listing.

    # -*- coding: utf-8 -*-
    
    from selenium import webdriver
    from lxml import etree
    import json
    import time
    
    class Tiantian_spider():
        def __init__(self):
            self.driver = webdriver.PhantomJS()  # requires the PhantomJS binary on PATH
            self.html = None  # page source of the currently loaded page
            self.next_page = True  # becomes False once the last page is reached
        
        
        # 1. Request the next page
        def parser_url(self):
            if self.next_page:
                # click the pager's last label to move to the next page
                self.driver.find_element_by_xpath("//div[@id ='pagebar']/label[last()]").click()
                time.sleep(4)  # the page needs time to load the new data
                self.html = self.driver.page_source
    
    
        # 2. Parse the fund table on the current page
        def parser_data(self):
            rel = []
            html = etree.HTML(self.html)
            tr_list = html.xpath("//table[@id ='dbtable']//tbody/tr")
            next_page = html.xpath("//div[@id ='pagebar']//label[last()]")
            # print(next_page)
            for tr in tr_list:
                dic = {}
                tds = tr.xpath("./td")
                dic["序号"] = tds[1].text
                print(dic["序号"])  # progress output
                dic["基金代码"] = tds[2].xpath("./a/text()")[0]
                dic["基金简称"] = tds[3].xpath("./a/@title")[0]
                dic["日期"] = tds[4].text
                dic["单位净值"] = tds[5].text
                dic["累计净值"] = tds[6].text
                dic["日增长率"] = tds[7].text
                dic["近一周"] = tds[8].text
                dic["近1月"] = tds[9].text
                dic["近3月"] = tds[10].text
                dic["近6月"] = tds[11].text
                rel.append(dic)
            return rel, next_page
    
    
        # Save one page of results; each call appends a separate JSON array,
        # so the output file is a sequence of arrays rather than one JSON document
        def save_data(self, data):
            with open("天天基金.txt", "a", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            # print("saved successfully")
    
    
        # Pagination controller: True while more pages remain
        def over_page(self, next_page):
            # a label with class "end" under the last pager element marks the final page
            kw = next_page[0].xpath("./label[contains(@class,'end')]")
            print(kw)  # debug output
            flag = len(kw) == 0  # no "end" marker found, so keep paging
            return flag
    
    
        def run(self, url):
            # 1. Load the first page
            self.driver.get(url)
            self.html = self.driver.page_source
            while self.next_page:
                # 2. Parse the current page
                data, next_page = self.parser_data()
                # 3. Save the parsed rows
                self.save_data(data)
                # 4. Check whether more pages remain, then click through to the next one
                self.next_page = self.over_page(next_page)
                self.parser_url()
                # print("Crawl finished!!")
            self.driver.quit()
    
    
    if __name__ == '__main__':
        url = "http://fund.eastmoney.com/data/fundranking.html#tall;c0;r;szzf;pn50;ddesc;qsd20200106;qed20210106;qdii;zq;gg;gzbd;gzfs;bbzt;sfbb"
        tiantian = Tiantian_spider()
        tiantian.run(url)
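
    The script above targets PhantomJS and the old find_element_by_xpath helpers. PhantomJS is no longer maintained, and newer Selenium releases have dropped both webdriver.PhantomJS() and the find_element_by_* methods, so here is a minimal sketch of the same request-and-paginate step on a headless Chrome driver. It assumes Selenium 4+ with Chrome and chromedriver available and reuses the XPaths from the script unchanged; the parsing, saving, and pagination logic of Tiantian_spider is driver-independent and would not need to change.

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    import time

    options = Options()
    options.add_argument("--headless=new")  # run without a visible window, as PhantomJS did
    driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH or managed by Selenium

    driver.get("http://fund.eastmoney.com/data/fundranking.html")
    time.sleep(4)  # give the ranking table time to render
    html = driver.page_source  # hand this to lxml exactly as parser_data() does

    # Selenium 4 replaces find_element_by_xpath(...) with find_element(By.XPATH, ...)
    driver.find_element(By.XPATH, "//div[@id ='pagebar']/label[last()]").click()

    driver.quit()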
    
    
    
    
    
  • Original post: https://www.cnblogs.com/hefany/p/14245212.html