zoukankan      html  css  js  c++  java
  • 办公自动化24-爬取CMB网站理财产品的基本信息、投资报告并格式化输出

    # -*- coding: utf-8 -*-
    """
    Created on Aug 5 2020
    
    @author: lizitingxue
    """
    
    #基础包
    import numpy as np
    import pandas as pd
    import os
    import time
    ##爬虫
    import re
    import requests
    from bs4 import BeautifulSoup
    import urllib.request
    from selenium import webdriver
    ##pdf处理
    import winerror
    from win32com import client as wc
    from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT
    ##word信息处理
    from docx import Document
    from openpyxl import Workbook
    
    #定义函数
    # 定义函数
    def get_code(key_word):
        url = 'http://www.cmbchina.com/cfweb/Personal/Default.aspx'
        js_bottom = "var q=document.documentElement.scrollTop=10000"
        
        browser = webdriver.Chrome()
        browser.get(url)
        time.sleep(3)
        browser.find_element_by_xpath('//*[@id="tbCondition"]/table[1]/tbody/tr[9]/td[2]/ul/li[2]/input[1]').click()
        browser.find_element_by_xpath('//*[@id="inputPrdKey"]').clear()
        browser.find_element_by_xpath('//*[@id="inputPrdKey"]').send_keys(key_word)
        browser.find_element_by_xpath('//*[@id="column_content"]/div[8]/input[1]').click()
        time.sleep(2)
        browser.execute_script(js_bottom)  
        time.sleep(2)
        browser.execute_script(js_bottom)   
        time.sleep(2) 
        ##产品名称和编号
        temp_for_code = browser.find_elements_by_css_selector("[class='leftArea']>[class='row']>[class='code']")
        print(key_word+'系列产品数量为'+str(len(temp_for_code)))
        temp_for_name = browser.find_elements_by_css_selector('[class="inlineBgImage"]>[class="inlineBriefName"]')
        temp_code = []
        temp_name = []
        for j in temp_for_code:
            temp_code.append(j.text)
        for i in temp_for_name:
            temp_name.append(i.text)
        
        ###产品pdf的下载链接
        url_lst = []
        counts = len(browser.find_elements_by_css_selector("[class='viewMore']"))
        # temp = browser.find_elements_by_css_selector("[class='viewMore']")
        for i in range(counts):
            #需要返回上一级
            browser.find_elements_by_css_selector("[class='viewMore']")[i].click()
            time.sleep(3)
            browser.switch_to.window(browser.window_handles[-1])
            browser.find_element_by_id('fmenu_7').click()
            temp_for_pdf = browser.find_elements_by_css_selector("[class='c_title']>[target = '_blank']")
    
            if len(temp_for_pdf)!= 0:
                pdf = temp_for_pdf[0].get_attribute('href')
                url_lst.append(pdf)
            else:
                url_lst.append("未披露")
            browser.close() 
            browser.switch_to.window(browser.window_handles[0]) 
            
        ######保存为excel
        tarDir = folder_path+"//"+key_word #希望生成code列表文件的位置",
        if not os.path.exists(tarDir):
            os.mkdir(tarDir)
        df = pd.DataFrame({"产品名称":temp_name,"产品代码":temp_code,"投资报告下载链接":url_lst})
        print(df)
        df.to_excel(tarDir+"\"+key_word+"产品代码和链接.xlsx",index = False)
        return url_lst
    
    
    
    def get_pdf(link,key_word):
        #保存pdf
        try:
            r = requests.get(link)
            r.encoding = r.apparent_encoding
            html = r.text
            soup = BeautifulSoup(html,'html.parser')
            pro_name = soup.find(name='head',attrs={"id":"Head1"}).text.replace('
    ', '').replace('
    ', '').replace('	', '').replace(' ', '').split("--")[-1]
            link = soup.find_all(name='a',attrs={"href":re.compile(r'.pdf$')})
            if len(link) != 0:
                mylink = link[0]
    #             name = mylink.string
    #             print(name)
                url = mylink.get('href')
    #             print(url)
                filename = pro_name
                filepath = folder_path + '/' +key_word+'/'+ pro_name +".pdf"
                urllib.request.urlretrieve(url, filename=filepath)
                print(pro_name+"报告下载完成~~~~~O(∩_∩)O~")
            else:
                print(pro_name+'报告下载失败01')
        except:
            print(pro_name+"报告下载失败02")
    
    def pdf_to_word(my_dir):
        ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
        os.chdir(my_dir)
        file_lst = [x for x in os.listdir('.') if os.path.isfile(x) and os.path.splitext(x)[1]=='.pdf']
        for f in file_lst:
            src = os.path.abspath(f)#原来的地址
    #         my_pdf = "招商银行零售青葵系列一年定开1号理财计划(产品代码:301025301025B301025C301025D)2020年第2季度报告.pdf"
            try:
                AvDoc = Dispatch("AcroExch.AVDoc")    
    
                if AvDoc.Open(src, ""):            
                    pdDoc = AvDoc.GetPDDoc()
                    jsObject = pdDoc.GetJSObject()
                    new = os.path.join(my_dir, f.split(".")[0]+'.docx')
                    jsObject.SaveAs(new, "com.adobe.acrobat.docx")#新的地址
                    #word格式转换
                    word = wc.Dispatch('Word.Application') 
                    doc = word.Documents.Open(new) 
                    doc.SaveAs(new.split(".")[0]+'T'+".docx",16) 
                    #16对应于下表中的默认docx文件文件
                    doc.Close() 
                    word.Quit()
                    #把原来的doc文件删除
                    if os.path.exists(new):
                        os.remove(new)
    
            except Exception as e:
                print(str(e))
    
            finally:        
                AvDoc.Close(True)
    
                jsObject = None
                pdDoc = None
                AvDoc = None     
                
    def date_trans(start):
        publish_Time = start.replace(' ','')
        array = time.strptime(publish_Time, u"%Y年%m月%d日")
        publishTime = time.strftime("%Y-%m-%d", array)
        return publishTime
    
    
    
    def basic_info(file):
        wb = Workbook()
        sheet = wb.active
        header = ['产品名称', '业绩比较基准', '产品风险评级', '杠杆上限', '理财产品份额总额(亿元)', '理财产品成立日','理财产品到期日',
                  '直接_债券','直接_一般债权','直接_资产支持证券','直接_现金及货币工具','直接_含权债权','直接_权益类','直接_商品及金融衍生品类','直接_合计',
                  '间接_债券','间接_一般债权','间接_资产支持证券','间接_现金及货币工具','间接_含权债权','间接_权益类','间接_商品及金融衍生品类','间接_合计',]
        sheet.append(header)
    
        for doc in file:
            document = Document(folder_path + "\"+key_word + "\"+doc)
            # 读取word中的所有表格
            tables = document.tables
            #基本信息
            table0= tables[0]
            name = table0.cell(1, 1).text.strip().replace(' ','')[4:][:-4]
            compe = table0.cell(6, 1).text.strip().replace(' ','')[2:]
            risk = table0.cell(7,1).text.strip().replace(' ','')
            leverage = table0.cell(8,1).text.strip().replace(' ','')
            total = table0.cell(9,1).text.strip().replace(' ','')
            start = date_trans(table0.cell(10,1).text.strip().replace(' ',''))
            end = date_trans(table0.cell(11,1).text.strip().replace(' ','')[0:11])
            #投资标的
            invest = tables[1]
            print(doc+"投资表格行数"+str(len(invest.rows))+"-"*20)
            temp_direct = []
            temp_indirect = []
            for i in range(8): 
                    try:
                        # 直接投资
                         # name = invest.cell(i+3,1)
                        amount = invest.cell(i+1,2).text.replace("
    ","")
                        temp_direct.append(amount)
                     # 间接投资
                        # name = invest.cell(i+3,1)
                        amount = invest.cell(i+1,4).text.replace("
    ","")
                        temp_indirect.append(amount)
                    except Exception as error:
                        # 捕获异常,也可以用log写到日志里方便查看和管理
                        print(error)
                        continue
            #格式化输出基本信息
            row = [name,compe,risk,leverage,total,start,end]+temp_direct+temp_indirect
            sheet.append(row)    
        wb.save(file_path+'\'+key_word+'基本信息.xlsx')    
    
        
    def nonstan_info(file):
        res = pd.DataFrame(columns = ['产品名称','项目名称','融资客户名称','交易日','项目剩余融资期限(单位:天)','报告期内到期收益率预估(%)','交易结构']) 
        for doc in file:
            document = Document(folder_path + "\"+key_word + "\"+doc)
            # 读取word中的所有表格
            tables = document.tables
            #非标信息
            name = tables[0].cell(1, 1).text.strip().replace(' ','')[4:][:-4]
            nonstan = tables[-2]
        #     print(len(nonstan.rows))#带表头
            nonstan_all = [[] for _ in range(len(nonstan.rows)-1)]
            if nonstan.cell(1,1).text != '-':
                for l in range(len(nonstan.rows)-1):
                    for num in range(6):
                        nonstan_all[l].append(nonstan.cell(l+1,num).text.replace("
    ",'').replace(" ",""))
        #     print(nonstan_all)#不带表头
                print(name + '非标项目数量为{0}'.format(len(nonstan_all)))
        #     print("-"*20)
    
            if nonstan_all != [[]]:
                for x in range(len(nonstan_all)):
                    temp = {'产品名称':name,'项目名称':nonstan_all[x][0],'融资客户名称':nonstan_all[x][1],'交易日':nonstan_all[x][2],'项目剩余融资期限(单位:天)':nonstan_all[x][3],'报告期内到期收益率预估(%)':nonstan_all[x][4],'交易结构':nonstan_all[x][5]}
                    res = res.append(temp,ignore_index=True)
    
        res.to_excel (file_path+'\'+key_word +'非标投资情况.xlsx', index = False, header=True)
    
    ##爬取过程
    
    #定义路径和爬取产品名称
    folder_path = 'F:\PY\爬虫\投资报告备份' 
    key_word = "卓远"
    
    #爬取数据
    url_lst = get_code(key_word)
    
    #下载报告
    for link in url_lst:
        if link != "未披露":
            get_pdf(link,key_word)
    
    #pdf解析
    my_dir = folder_path+'/'+key_word 
    pdf_to_word(my_dir)
    
    #提取投资报告信息
    file_path = folder_path+"\"+key_word
    file = [x for x in os.listdir(file_path) if os.path.isfile(x) and os.path.splitext(x)[1]=='.docx']
    file
    
    #保存信息
    basic_info(file)
    nonstan_info(file)

    有问题欢迎留言哦~~~O(∩_∩)O

  • 相关阅读:
    【转载】Hibernate---在Hibernate中获取数据方式与缓存使用
    【转载】Hibernate ORM 新特性之 Service(Registry)
    【转载】Hibernate三种状态的区分,以及save,update,saveOrUpdate,merge等的使用
    【转载】Java Cache系列之Cache概述和Simple Cache
    C#进行AutoCAD2014二次开发的注意事项
    WIN10更新后重新激活CAD
    获取子图元
    常用语句
    向命令行发送命令
    DatagridView内容自动换行和换行符换行
  • 原文地址:https://www.cnblogs.com/lizitingxue/p/13442092.html
Copyright © 2011-2022 走看看