  • Scraping Times newspaper data with python3 and IEDriver

        I recently tested scraping data from the XX Times. The site requires logging in first, and the login form is protected by a captcha. There are two ways to handle the captcha: one is to use a captcha-solving platform, which works by uploading the captcha image to the platform and getting the recognized text back; the other is to work out captcha recognition yourself, which I will look into when I have time.

        For now the goal is just to test scraping the XX Times, so I temporarily used the crude approach of having a human type in the captcha.
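
        As a sketch of the first approach: a captcha-platform call usually boils down to POSTing the image bytes and reading the recognized text back. The endpoint, credentials, and response field below are hypothetical placeholders, not any real platform's API:

    # Minimal sketch of the captcha-platform approach (approach one above).
    # The URL, credentials and response field are hypothetical placeholders.
    import requests

    def solve_captcha(image_path):
        with open(image_path, 'rb') as f:
            resp = requests.post(
                'https://captcha-platform.example.com/solve',   # hypothetical endpoint
                data={'user': 'myuser', 'password': 'mypass'},  # placeholder credentials
                files={'image': f},
            )
        return resp.json().get('result', '')  # the recognized captcha text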

        The login screen (screenshot omitted):

        The full code is as follows:

    #coding=utf-8
    import os
    import re
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    from selenium.webdriver.common.action_chains import ActionChains
    import collections
    import mongoDbBase  # the author's MongoDB helper module (defined elsewhere)
    import numpy
    import imagehash
    from PIL import Image,ImageFile
    import datetime
    class finalNews_IE:
        def __init__(self,strdate,logonUrl,firstUrl,keyword_list,exportPath,codedir):
            self.iniDriver()
            self.db = mongoDbBase.mongoDbBase()
            self.date = strdate
            self.firstUrl = firstUrl
            self.logonUrl = logonUrl
            self.keyword_list = keyword_list
            self.exportPath = exportPath
            self.codedir = codedir
            self.hash_code_dict ={}
    
    
        def iniDriver(self):
            # Path to IEDriverServer.exe (originally meant to come from a config file)
            IEDriverServer = r"C:\Program Files\Internet Explorer\IEDriverServer.exe"
            os.environ["webdriver.ie.driver"] = IEDriverServer
            self.driver = webdriver.Ie(IEDriverServer)
    
        def WriteData(self, message, fileName):
            fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
            with open(fileName, 'a') as f:
                f.write(message)
    
        # Compute the perceptual hash of an image file
        def get_ImageHash(self,imagefile):
            hash = None
            if os.path.exists(imagefile):
                with open(imagefile, 'rb') as fp:
                    hash = imagehash.average_hash(Image.open(fp))
            return hash
    
        # Remove speckle noise by thresholding the image
        def clearNoise(self, imageFile, x=0, y=0):
            if os.path.exists(imageFile):
                image = Image.open(imageFile)
                image = image.convert('L')  # greyscale
                image = numpy.asarray(image)
                # binarize: anything brighter than the threshold becomes white
                image = ((image > 135) * 255).astype(numpy.uint8)
                image = Image.fromarray(image).convert('RGB')
                image.save(imageFile)
                return image
    
        # Split the captcha into individual digit images
        # rownum: number of rows; colnum: number of columns; imagePath: output directory; imageFile: image to split
        def splitimage(self, imagePath,imageFile,rownum=1, colnum=4):
            img = Image.open(imageFile)
            w, h = img.size
            if rownum <= h and colnum <= w:
                print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
                print('Splitting captcha image, please wait...')
    
                s = os.path.split(imageFile)
                if imagePath == '':
                    imagePath = s[0]  # default to the source image's directory
                fn = s[1].split('.')
                basename = fn[0]
                ext = fn[-1]
    
                num = 1
                rowheight = h // rownum
                colwidth = w // colnum
                file_list =[]
                for r in range(rownum):
                    index = 0
                    for c in range(colnum):
                        # (left, upper, right, lower)
                        # the first two digits are cropped slightly wider to compensate
                        # for uneven spacing in the captcha
                        if index < 1:
                            colwid = colwidth + 6
                        elif index < 2:
                            colwid = colwidth + 1
                        else:
                            colwid = colwidth
    
                        box = (c * colwid, r * rowheight, (c + 1) * colwid, (r + 1) * rowheight)
                        newfile = os.path.join(imagePath, basename + '_' + str(num) + '.' + ext)
                        file_list.append(newfile)
                        img.crop(box).save(newfile, ext)
                        num = num + 1
                        index += 1
                return file_list
    
        def compare_image_with_hash(self, image_hash1, image_hash2, max_dif=5):
            """
            max_dif: maximum allowed hash difference; smaller is stricter, 0 means identical.
            """
            dif = abs(image_hash1 - image_hash2)  # Hamming distance between the two hashes
            return dif <= max_dif
    
        # Screenshot the page and crop the captcha image out of it
        def savePicture(self):
            self.driver.save_screenshot(self.codedir + "Temp.png")
            checkcode = self.driver.find_element_by_id("checkcode")
            location = checkcode.location  # x/y coordinates of the captcha element
            size = checkcode.size  # width/height of the captcha element
            rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                      int(location['y'] + size['height']))  # region to crop
            i = Image.open(self.codedir + "Temp.png")  # open the full-page screenshot
            result = i.crop(rangle)  # crop the captcha region from the screenshot
            filename = self.codedir + "Temp_code.png"
            result.save(filename)
            self.clearNoise(filename)
            file_list = self.splitimage(self.codedir, filename)
            time.sleep(3)
            verycode =''
            for f in file_list:
                imageHash = self.get_ImageHash(f)
                if imageHash:
                    for h, code in self.hash_code_dict.items():
                        # exact match only (max_dif=0) against the digit library
                        flag = self.compare_image_with_hash(imageHash, h, 0)
                        if flag:
                            verycode += code
                            break
    
            print(verycode)
            return verycode
    
        def getVerycode(self, txtFile="verycode.txt"):
            with open(txtFile, 'r') as f:
                result = f.read()
            return result
    
        def longon(self):
            # build the digit library: ten digits (0-9), four labelled samples of each
            for f in range(0, 10):
                for l in range(1, 5):
                    file = os.path.join(self.codedir, "codeLibrary", "code" + str(f) + '_' + str(l) + ".png")
                    hash = self.get_ImageHash(file)
                    self.hash_code_dict[hash] = str(f)
    
            flag = True
            try:
                self.driver.get(self.logonUrl)
                self.driver.maximize_window()
                time.sleep(2)
                verycode = self.savePicture()
                if len(verycode)==4:
                    accname = self.driver.find_element_by_id("username")
                    accname.send_keys('ctrchina')

                    accpwd = self.driver.find_element_by_id("password")
                    # accpwd.send_keys('123456')  # not needed here: the browser has the password saved
                    checkcode = self.driver.find_element_by_name("checkcode")
                    checkcode.send_keys(verycode)
                    submit = self.driver.find_element_by_name("button")
                    submit.click()
                else:
                    flag = False
            except Exception as e1:
                message = str(e1.args)
                flag = False
            return flag
    
        # Collect the layout (page section) links and keywords
        def saveUrls(self):
            error = ''
            while True:
                flag = self.longon()
                time.sleep(2)
                if flag:
                    try:
                        # if the page still shows the login-error element, the
                        # captcha was presumably wrong; go round and log in again
                        codefault = self.driver.find_element_by_xpath("//table[@class='table_login']/tbody/tr/td/font")
                        if codefault:
                            continue
                    except Exception as e1:
                        pass
                    break
            try:
                time.sleep(2)
                self.driver.get(self.firstUrl)
                self.driver.maximize_window()
                # urllb = "//div[@id='pageLink']/ul/div/div/a"
                urllb = "//a[@id='pageLink']"
                time.sleep(2)
                elements = self.driver.find_elements_by_xpath(urllb)
                url_layout_dict = collections.OrderedDict()
                for element in elements:
                    layout = element.text
                    # print(layout)
                    if len(layout) == 0:
                        continue
                    # layout = txt[txt.find(":") + 1:]
                    link =  element.get_attribute("href")
                    print(link)
                    if link not in url_layout_dict:
                        url_layout_dict[link] = layout
                index = 0
                for sub_url, layout in url_layout_dict.items():
                    if index == 0:
                        sub_url = ""  # the first layout is already open; getArticleLink skips navigation
                    print(index)
                    self.getArticleLink(sub_url, layout)
                    index += 1
            except Exception as e1:
                print("saveUrlsException")
                print("saveUrlsException:Exception" + str(e1.args))
    
    
        def getArticleLink(self, url,layout):
            error = ''
            try:
                if url:
                    self.driver.get(url)
                    self.driver.maximize_window()
                time.sleep(2)
                dt = datetime.datetime.now().strftime("%Y.%m.%d")
                urllb = "//div[@id='titleList']/ul/li/a"
                elements = self.driver.find_elements_by_xpath(urllb)
                url_layout_dict = {}
                for element in elements:
                    txt = element.text
                    txt = txt[txt.rfind(")") + 1:len(txt)]
                    # skip untitled items, announcements, the masthead and date lines
                    if txt.find("无标题") > -1 or txt.find("公 告") > -1 or txt.find("FINANCIAL NEWS") > -1 or txt.find(dt) > -1:
                        continue
                    link = element.get_attribute("href")
                    print(link)
                    url_layout_dict[link] = layout
                self.db.SavefinalUrl(url_layout_dict,self.date)
            except Exception as e1:
                print("getArticleLink:Exception")
                print("getArticleLink:Exception" + str(e1.args))
                error = e1.args
    
    
        def catchdata(self):
    
            rows = self.db.GetfinalUrl(self.date)
            lst = []
            for row in rows:
                lst.append(row)
            print("rowcount:"+str(len(lst)))
            count =1
            for row in lst:
                url = row['url']
                layout = row['layout']
                try:
                    self.driver.get(url)
                    self.driver.maximize_window()
                    time.sleep(1)
                    title = ""
    
                    # t1 = doc("div[class='text_c']")
                    element = self.driver.find_element_by_class_name("text_c")
                    title = element.find_element_by_css_selector("h3").text
                    st = element.find_element_by_css_selector("h1").text
                    if st:
                        title += "
    " + st
                    st = element.find_element_by_css_selector("h2").text
                    if st:
                        title += "
    " + st
    
                    st = element.find_element_by_css_selector("h4").text
                    if st:
                        if st.find("记者") == -1:
                            title += "
    " + st
                        # else:
                        #     author = st.replace("记者","").replace("本报","").strip()
                    elements = self.driver.find_elements_by_xpath("//div[@id='ozoom']/p")
    
                    content = ""
    
                    key = ""
                    index = 0
                    author = ''
                    for element in elements:
                        txt = element.text.strip().replace("\n", "")
                        content += txt
                        if index == 0:
                            # try to pull the reporter's name from the first paragraph
                            if txt.find("记者") > 0 and txt.find("报道") > 0:
                                author = txt[txt.find("记者") + 2:txt.find("报道")]
                            elif txt.find("记者") > 0 and txt.find("报道") == -1:
                                author = txt[txt.find("记者") + 2:len(txt)]
                            elif txt.find("记者") == -1 and txt.find("报道") == -1:
                                author = txt.strip()
                        index += 1
    
                    for k in self.keyword_list:
                        if content.find(k) > -1 or title.find(k) > -1:
                            key += k + ","
                    if key:
                        key = key[0:len(key) - 1]  # drop the trailing comma
                    author = author.replace("记者", "").strip()
                    if len(author) > 6:
                        author = ""  # longer than a plausible reporter name; discard
                    print(count)
                    print(layout)
                    print(url)
                    print(title)
                    print(author)
                    count+=1
                    # print(content)
                    self.db.updatefinalUrl(url)
                    self.db.SavefinalData(self.date,layout,url,title,author,key,content)
                except Exception as e1:
                    error = e1.args
            self.driver.close()
    
        def export(self):
            rows = self.db.GetfinalData(self.date)
            lst = []
            for dataRow1 in rows:
                lst.append(dataRow1)
            count =1
            # dt = datetime.datetime.now().strftime("%Y-%m-%d")
            fileName = '金融时报_' + self.date + '.csv'
            header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
            if len(lst)>0:
                self.WriteData(header, fileName)
    
            for dataRow in lst:
                date = str(dataRow['date'])
                layout = str(dataRow['layout'])
                url = str(dataRow['url'])
                # commas and newlines would break the CSV layout, so strip them
                title = str(dataRow['title']).replace(",", "").replace("\n", " ")
                author = str(dataRow['author']).replace(",", "")
                key = str(dataRow['key']).replace(",", "")
                wordcount = str(dataRow['wordcount'])
                content = str(dataRow['content']).replace(",", "").replace("\n", " ")

                txt = "\n%s,%s,%s,%s,%s,%s,%s,%s" % (
                    date, key, author, wordcount, title, layout, url, content)
                try:
                    self.WriteData(txt, fileName)
                except Exception as e1:
                    print(str(e1))
                print(count)
                count += 1
    
    
    #
    # dt = datetime.datetime.now().strftime("%Y-%m-%d")
    # ym = datetime.datetime.now().strftime("%Y-%m")
    # day = datetime.datetime.now().strftime("%d")
    #
    # codepath='E:/python36_crawl/mediaInfo/verycode.txt'
    #
    # logonUrl="http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
    # # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
    # firsturl="http://epaper.financialnews.com.cn/jrsb/html/"+ym+"/"+day+"/node_2.htm"
    # # print(firsturl)
    # keyword_list ="银保监会,央行,中国银行,中行,中银".split(",")
    # exportPath="E:/News"
    # codedir = 'E:\\python36_crawl\\Veriycode\\'
    # obj = finalNews_IE(dt,logonUrl,firsturl,keyword_list,exportPath,codedir)
    # # obj.saveUrls()
    # obj.catchdata()
    # obj.export()
    # # obj.savePicture()
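
      A note on how the automated digit matching in savePicture works: imagehash objects subtract to give a Hamming distance, so comparing each cropped digit against a pre-captured, hand-labelled library of digit images (the codeLibrary folder) with max_dif=0 accepts only exact perceptual matches. A minimal standalone check, with illustrative file names:

    # Compare a cropped captcha digit against one labelled library sample.
    # The file names here are illustrative.
    from PIL import Image
    import imagehash

    lib_hash = imagehash.average_hash(Image.open('codeLibrary/code5_1.png'))  # a sample labelled "5"
    digit_hash = imagehash.average_hash(Image.open('Temp_code_1.png'))        # freshly cropped digit
    print(lib_hash - digit_hash)  # 0 means an exact perceptual match: the digit is a "5"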

      Scraper 2: 第一财经日报

      This site navigates through onclick handlers rather than plain links, so each navigation step reads the element's onclick attribute and executes it with execute_script:

        layoutLink = layoutElement.get_attribute("onclick")
        self.driver.execute_script(layoutLink)
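
      Executing the onclick script re-renders the page, so previously located WebElements go stale; that is why the code below re-runs find_elements_by_xpath and re-indexes before every step. A sketch of that loop, assuming the onclick attribute holds a self-contained JavaScript call:

        # Sketch of the navigate-by-onclick loop (names follow the code below).
        from selenium import webdriver

        driver = webdriver.Ie()  # assumes IEDriverServer is on PATH (as in iniDriver)
        driver.get('http://buy.yicai.com/read/index/id/5.html')
        layoutlb = "//ul[@class='BNameList']/li/a"
        for i in range(len(driver.find_elements_by_xpath(layoutlb))):
            elements = driver.find_elements_by_xpath(layoutlb)  # re-locate: old references go stale
            onclick = elements[i].get_attribute("onclick")      # the site-specific JS call
            driver.execute_script(onclick)                      # performs the "click"
            # ... scrape this layout, then driver.get(...) to return to the front page ...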
    #coding=utf-8
    import os
    import re
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    from selenium.webdriver.common.action_chains import ActionChains
    import collections
    import mongoDbBase  # the author's MongoDB helper module (defined elsewhere)
    import datetime
    import numpy
    from PIL import Image
    import RClient  # client for a third-party captcha-solving (打码) platform
    class firstfinal:
        def __init__(self, strdate, firstUrl, keyword_list, exportPath,dirpath):
    
            self.db = mongoDbBase.mongoDbBase()
            self.date = strdate
            self.firstUrl = firstUrl
            self.keyword_list = keyword_list
            self.exportPath = exportPath
            self.dirpath = dirpath
            self.rclient = RClient.RClient()
    
    
        def iniDriver(self):
            # Path to IEDriverServer.exe
            IEDriverServer = r"C:\Program Files\internet explorer\IEDriverServer.exe"
            os.environ["webdriver.ie.driver"] = IEDriverServer
            self.driver = webdriver.Ie(IEDriverServer)
    
        def WriteData(self, message, fileName):
            fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
            with open(fileName, 'a') as f:
                f.write(message)
    
        def getVerycode(self, txtFile="verycode.txt"):
            with open(txtFile, 'r') as f:
                result = f.read()
            return result

        # Remove speckle noise by thresholding the image
        def clearNoise(self, imageFile, x=0, y=0):
            if os.path.exists(imageFile):
                image = Image.open(imageFile)
                image = image.convert('L')  # greyscale
                image = numpy.asarray(image)
                # binarize: anything brighter than the threshold becomes white
                image = ((image > 135) * 255).astype(numpy.uint8)
                image = Image.fromarray(image).convert('RGB')
                image.save(imageFile)
                return image
        def savePicture(self):
            logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")  # XPath indexes start at 1
            logon.click()
            time.sleep(2)
            checkcode = self.driver.find_element_by_id("Verify")
            temppng = "E:\\python36_crawl\\Veriycode\\Temp.png"

            self.driver.save_screenshot(temppng)
            location = checkcode.location  # x/y coordinates of the captcha element
            size = checkcode.size  # width/height of the captcha element
            rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                      int(location['y'] + size['height']))  # region to crop
            i = Image.open(temppng)  # open the full-page screenshot
            result = i.crop(rangle)  # crop the captcha region from the screenshot
            result.save(temppng)
            return temppng
    
    
        def longon(self):
            self.iniDriver()
            self.driver.get(self.firstUrl)
            self.driver.maximize_window()

            logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")  # XPath indexes start at 1
            logon.click()
            self.driver.maximize_window()
            time.sleep(2)

            accname = self.driver.find_element_by_name("username")
            accname.send_keys('ctrchina')

            accpwd = self.driver.find_element_by_name("password")
            # the browser on the server has the password saved, so this would otherwise not be needed
            accpwd.send_keys('123456')

            # send the captcha image to the coding platform and type the answer back
            checkcode = self.driver.find_element_by_name("code")
            temppng = self.savePicture()
            code = self.rclient.test(temppng)
            checkcode.send_keys(code)

            submit = self.driver.find_element_by_xpath("//div[@class='UserFrom']/div[8]/button")
            submit.click()

            time.sleep(4)
    
        # Loop over the layout pages and the article list on each one
        def catchData(self):
            flag = True
            try:
    
                layoutlb = "//ul[@class='BNameList']/li/a"
                artclelb = "//div[@id='SetContent']/ul/li/a"
                contentlb = "//div[@id='SetContent']/ul/li/a"
                layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                layoutCount = len(layoutElements)
                layoutIndex = 0
                layout = ''
                # loop over the layout pages
                print("layoutCount="+str(layoutCount))
                while layoutIndex<layoutCount:
                    if layoutIndex >0:
                        self.driver.get(self.firstUrl)
                        self.driver.maximize_window()
                        layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                        layoutElement = layoutElements[layoutIndex]
                        layoutLink = layoutElement.get_attribute("onclick")
                        self.driver.execute_script(layoutLink)
                    else:
                        layoutElement = layoutElements[layoutIndex]
                    layout = layoutElement.text
                    print(layout)
                    articleElements = self.driver.find_elements_by_xpath(artclelb)
                    articleCount = len(articleElements)
                    print("articleCount=" + str(articleCount))
                    articleIndex = 0
                    # loop over the article list on this layout
                    while articleIndex < articleCount:
                        if articleIndex > 0 :
                            self.driver.get(self.firstUrl)
                            self.driver.maximize_window()
                            layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                            layoutElement = layoutElements[layoutIndex]
                            layoutLink = layoutElement.get_attribute("onclick")
                            self.driver.execute_script(layoutLink)
    
                        elements = self.driver.find_elements_by_xpath(contentlb)
                        sublink = elements[articleIndex].get_attribute("onclick")
                        title = elements[articleIndex].text
                        print(title)
                        self.driver.execute_script(sublink)
                        author = self.driver.find_element_by_id("Setauthor").text
                        subE = self.driver.find_elements_by_xpath("//div[@id='SetContent']/p")
                        content = ''
                        for se in subE:
                            content += se.text
                        key = ''
                        for k in self.keyword_list:
                            if content.find(k) > -1 or title.find(k) > -1:
                                key += k + ","
                        if key:
                            key = key[0:len(key) - 1]
                        print(author)
                        # print(content)
                        print(key)
                        print('\n')
                        articleIndex += 1
                        self.db.SaveFirsFinalData(self.date, layout, self.firstUrl, title, author, key, content)
                    layoutIndex+=1
    
            except Exception as e1:
                print("catchData:Exception" + str(e1.args))
                flag = False
    
        def export(self):
            try:
                rows = self.db.GetFirsFinalData(self.date)
                lst = []
                for dataRow1 in rows:
                    lst.append(dataRow1)
                count = 1
                dt = datetime.datetime.now().strftime("%Y-%m-%d")
                fileName = '第一财经日报_' + self.date + '.csv'
                header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
                if len(lst)>0:
                    self.WriteData(header, fileName)
                # every article was read from the same reader page, so the link is constant
                url = 'http://buy.yicai.com/read/index/id/5.html'
                for dataRow in lst:
                    date = str(dataRow['date'])
                    layout = str(dataRow['layout'])
                    # commas and newlines would break the CSV layout, so strip them
                    title = str(dataRow['title']).replace(",", "").replace("\n", " ")
                    author = str(dataRow['author']).replace(",", "")
                    key = str(dataRow['key']).replace(",", "")
                    wordcount = str(dataRow['wordcount'])
                    content = str(dataRow['content']).replace(",", "").replace("\n", " ")

                    txt = "\n%s,%s,%s,%s,%s,%s,%s,%s" % (
                        date, key, author, wordcount, title, layout, url, content)
                    try:
                        self.WriteData(txt, fileName)
                    except Exception as e1:
                        print(str(e1))
                    print(count)
                    count += 1
            except Exception as e1:
                error = e1.args
    
        def test(self):
            dt = datetime.datetime.now().strftime("%Y-%m-%d")
            dirpath = "E:\\python36_crawl"
            firsturl = 'http://buy.yicai.com/read/index/id/5.html'
            keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
            exportPath = "E:/News"
            obj = firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
            obj.longon()
            obj.catchData()
            obj.export()
    
    # dt = datetime.datetime.now().strftime("%Y-%m-%d")
    # dirpath = "E:\\python36_crawl"
    # firsturl = 'http://buy.yicai.com/read/index/id/5.html'
    # keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
    # exportPath = "E:/News"
    # obj = firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
    # obj.longon()
    # obj.catchData()
    # obj.export()
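
      Finally, a runner script ties the two scrapers together: it loops forever, regenerates each day's two CSV files if they are missing, and mails the results at 08:50.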
    # coding=utf-8
    import datetime
    import finalNews_IE  # scraper 1 (金融时报), defined above
    import firstfinal    # scraper 2 (第一财经日报), defined above
    import Mail          # the author's e-mail helper module (not shown here)
    import time
    import os
    
    # def WriteData(message, fileName):
    #     fileName = os.path.join(os.getcwd(), 'mailflag.txt')
    #     with open(fileName, 'a') as f:
    #         f.write(message)
    def run():
        attachmentFileDir = "E:\\News"  # where the scrapers write their CSVs (exportPath)
        mailflagfile = os.path.join(os.getcwd(), 'mailflag.txt')
        while True:
            date = datetime.datetime.now()
            strtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(strtime + " main loop running")
            dt = datetime.datetime.now().strftime("%Y-%m-%d")
            ym = datetime.datetime.now().strftime("%Y-%m")
            day = datetime.datetime.now().strftime("%d")
    
            fileName = '金融时报_' + dt + '.csv'
            fileName = os.path.join(attachmentFileDir, fileName)
    
            firstfileName = '第一财经日报_' + dt + '.csv'
            firstfileName = os.path.join(attachmentFileDir, firstfileName)
    
            if not os.path.exists(fileName):
                # scrape 金融时报 (Financial News)
                logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
                firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
                keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
                exportPath = "E:/News"
                codedir = 'E:\\python36_crawl\\Veriycode\\'
                obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
                obj.saveUrls()
                obj.catchdata()
                obj.export()
            if not os.path.exists(firstfileName):
                # scrape 第一财经日报 (Yicai)
                dirpath = "E:\\python36_crawl"
                firsturl = 'http://buy.yicai.com/read/index/id/5.html'
                keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
                exportPath = "E:/News"
                obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
                obj.longon()
                obj.catchData()
                obj.export()
            if date.strftime('%H:%M') == "08:50":
                # send the day's CSVs by e-mail
                obj = Mail.Mail()
                obj.test()
                # WriteData(dt,mailflagfile)
                time.sleep(100)
            else:
                time.sleep(10)
    
    
    run()
    
    # try:
    
        # dt = datetime.datetime.now().strftime("%Y-%m-%d")
        # ym = datetime.datetime.now().strftime("%Y-%m")
        # day = datetime.datetime.now().strftime("%d")
        # # scrape 金融时报 (Financial News)
        # logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
        # # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
        # firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
        # # print(firsturl)
        # keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
        # exportPath = "E:/News"
        # codedir = 'E:\\python36_crawl\\Veriycode\\'
        # obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
        # obj.saveUrls()
        # obj.catchdata()
        # obj.export()
        #
        # # scrape 第一财经日报 (Yicai)
        # dirpath = "E:\\python36_crawl"
        # firsturl = 'http://buy.yicai.com/read/index/id/5.html'
        # keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
        # exportPath = "E:/News"
        # obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
        # obj.longon()
        # obj.catchData()
        # obj.export()
    
        # send the e-mail
    #     obj = Mail.Mail()
    #     obj.test()
    # except Exception as e1:
    #     print(str(e1))
  • Original post: https://www.cnblogs.com/shaosks/p/9697977.html