  • Scraping Eastmoney financial reports

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    import time
    import pandas as pd

    # Selenium 4 style: the chromedriver path is passed via Service
    browser = webdriver.Chrome(service=Service("/usr/local/bin/chromedriver"))
    # browser = webdriver.Chrome()  # or rely on chromedriver being on PATH
    browser.maximize_window()          # maximize the window
    wait = WebDriverWait(browser, 10)  # maximum wait time in seconds
    def index_page(page):
        '''
        Scrape one page of the report table.
        :param page: page number to fetch
        :return: DataFrame for the page, or None on timeout
        '''
        # url = "http://data.eastmoney.com/bbsj/201806/lrb.html"   # income-statement page
        url = "http://data.eastmoney.com/bbsj/202006/yjkb.html"    # 2020-06 earnings flash reports
        try:
            browser.get(url=url)
            print("Scraping page %s" % page)
            # past page 1, type the target page number into the pager and jump;
            # page 1 is already loaded by browser.get()
            if page > 1:
                # locate the page-number input box
                page_input = wait.until(EC.presence_of_element_located((By.ID, "PageContgopage")))
                page_input.clear()
                page_input.send_keys(page)
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#PageCont > a.btn_link")))
                submit.click()
                time.sleep(2)
            # wait until the data table exists and the pager highlights the requested page
            wait.until(EC.presence_of_element_located((By.ID, "dt_1")))
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#PageCont > span.at"), str(page)))
            # grab every cell in the table, then reshape the flat list into rows
            element = browser.find_element(By.CSS_SELECTOR, "#dt_1")
            all_td = element.find_elements(By.TAG_NAME, "td")
            lst = [td.text for td in all_td]
            # the column count comes from the first row of the table
            col = len(element.find_elements(By.CSS_SELECTOR, "tr:first-child td"))
            lst = [lst[i:i + col] for i in range(0, len(all_td), col)]
            # collect the detail-page links (the red anchors, one per row)
            lst_link = []
            for link in element.find_elements(By.CSS_SELECTOR, "a.red"):
                lst_link.append(link.get_attribute("href"))
    
            # columns = ["序号", "股票代码", "股票简称", "相关", '净利润(元)', "净利润同比(%)", "营业总收入(元)", "营业总收入同比",
            #    "营业支出(元)", "销售费用(元)", "管理费用(元)", "财务费用(元)", "营业总支出", "营业利润(元)",
            #    "利润总额(元)", "公告日期"]
            columns = ["序号", "股票代码", "股票简称", "相关", "每股收益", "营业收入(元)", "去年同期(元)", "同比增长",
                       "季度环比增长", "净利润", "去年同期", "同比增长", "季度环比增长", "每股净资产", "净资产收益率", "所处行业", "公告日期"]
    
            df_table = pd.DataFrame(lst, columns=columns)
            df_table["url"] = lst_link
            # print(df_table)
            # exit()
            return df_table
        except Exception:
            return None
    
    
    def main():
        all_data = pd.DataFrame()
        for page in range(1, 5):   # pages 1 through 4
            df_table = index_page(page)
            # pd.concat silently drops None results from failed pages
            all_data = pd.concat([all_data, df_table], ignore_index=True)
        all_data.to_excel("2020-06 listed company earnings data.xlsx")
        browser.quit()   # release the browser once all pages are scraped


    if __name__ == "__main__":
        main()
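
    If the script has to run on a machine without a display, Chrome can be started headless instead of maximized. A minimal sketch of that setup, assuming Selenium 4 (the flags below are standard Chrome options, not part of the original post):

        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        options = Options()
        options.add_argument("--headless=new")           # no visible window
        options.add_argument("--window-size=1920,1080")  # maximize_window() does nothing headless, so fix the size
        browser = webdriver.Chrome(options=options)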

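    To sanity-check the export, the workbook can be read back with pandas. A small sketch, assuming the output filename used in main() above (reading .xlsx requires openpyxl):

        import pandas as pd

        df = pd.read_excel("2020-06 listed company earnings data.xlsx", index_col=0)
        print(df.shape)                  # rows x columns actually captured
        print(df["股票代码"].nunique())  # distinct stock codes, to spot repeated pages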

  • Original post: https://www.cnblogs.com/Ezhizen/p/13590116.html