zoukankan      html  css  js  c++  java
  • selenium-浙江新闻网

      1 # coding: utf-8
      2 import html
      3 import json
      4 import os
      5 import re
      6 import sys
      7 from datetime import datetime
      8 from pprint import pprint
      9 
     10 import parsel
     11 import requests
     12 import schedule
     13 from selenium.webdriver import Chrome
     14 from selenium.webdriver.chrome.options import Options
     15 import random
     16 import time
     17 import threading
     18 from concurrent.futures import ThreadPoolExecutor
     19 
     20 
     21 class WebDriver():
     22     def __init__(self, url):
     23         self.url = url
     24         self.option = Options()
     25         self.option.add_argument('--disable-blink-features=AutomationControlled')
     26         self.option.add_argument('headless')
     27         self.option.add_argument('--disable-gpu')
     28         self.web = Chrome(options=self.option)
     29 
     30     def run(self):
     31         self.web.get(url=self.url)
     32         self.web.maximize_window()
     33         self.web.implicitly_wait(20)
     34         time.sleep(1)
     35 
     36         for i in range(0, 10000, random.randint(100,150)):
     37             js = f"document.querySelector('#zjxw_pc').scrollTo(0, {i})"
     38             self.web.execute_script(js)
     39             time.sleep(random.random())
     40         return self.pageSource()
     41 
     42     def pageSource(self):
     43         return self.web.page_source
     44 
     45 
     46 class Spider(WebDriver):
     47     def parseHomePage(self, htmlSource):
     48         selector = parsel.Selector(htmlSource)
     49         divs = selector.css('div.channelList_article_container__3dui5 div.listItems_article_item__3B3fT')
     50         articleUrls = []
     51         for div in divs:
     52             articleUrl = div.css('a::attr(href)').get()
     53             articleUrls.append(articleUrl)
     54         return articleUrls
     55 
     56     def paseArticlePage(self, articleUrl):
     57         from faker import Faker
     58         headers = {'user-agent': Faker().user_agent()}
     59         resp = requests.get(url=articleUrl, headers=headers)
     60         time.sleep(random.random())
     61 
     62         selectoe = parsel.Selector(resp.text)
     63         try:
     64             json_data = json.loads(selectoe.css('script#__NEXT_DATA__::text').get())
     65             articleData = json_data['props']['pageProps']['article']
     66         except TypeError:
     67             return None
     68 
     69         docTitle = articleData['docTitle']
     70         publishedAt = articleData['publishedAt']
     71         dateObj = datetime.fromtimestamp(int(str(publishedAt)[0:10]))
     72         publishedAt = dateObj.strftime('%Y-%m-%d %H:%M:%S')
     73         content = articleData['content']
     74         source = articleData['source']
     75         author = articleData['author']
     76         if author:
     77             author = f'<p>作者:{author}</p>'
     78         else:
     79             author = ''
     80         webLink = articleData['webLink']
     81         if webLink != '':
     82             video = ''.join(selectoe.css('video').getall())
     83             video = html.unescape(video)
     84             video = f'<div div align="center" style="text-align: left;">{video}</div>'
     85             content = video + content
     86 
     87         date = publishedAt.split(' ', 1)[0]
     88         pub_date = publishedAt + '@' + source
     89         column_url = self.url
     90         head = f'<p class="column_url"><a href="{column_url}">栏目地址:{column_url}</a></p>
    ' 
     91                f'<p class="chapter_url"><a href="{articleUrl}">文章详细地址:{articleUrl}</a></p>
    ' 
     92                f'<p class="Summary"></p>
    ' 
     93                f'<p class="pub_data">{pub_date}</p>
    '
     94         content = f'<div class="content">{author}{content}</div>'
     95         content = head + '
    ' + content
     96         return docTitle, content, date
     97 
     98     def save_content(self, path, title, content, date):
     99         new_date = str(date).replace('/', '-')
    100         timestamp = str(time.time()).split('.')[1]
    101         if title == None:
    102             if content:
    103                 title = timestamp
    104             else:
    105                 return
    106         new_title = ''.join(re.findall(r'([u2E80-u9FFF0-9a-zA-Z“”:?!《》,-]+)', title))
    107         if '.' in new_title:
    108             new_title = new_title.replace('.', '')
    109         if 'BR' in new_title:
    110             new_title = new_title.replace('BR', '')
    111 
    112         path = path.replace('\', '\\')
    113         filedspath = f'{path}\{new_date}\'
    114         if not os.path.exists(filedspath):
    115             os.mkdir(filedspath)
    116         filedsname = filedspath + f'{new_title}.html'
    117         with open(filedsname, mode='w', encoding='utf-8') as f:
    118             f.write('<!DOCTYPE html>
    <html>
    ')
    119             f.write('<head><meta charset="UTF-8"></head>
    ')
    120             f.write('<body>
    ')
    121             f.write(f'<h1 align="center">{title}</h1>
    ')
    122             f.write(content)
    123             f.write('
    </body>
    </html>')
    124 
    125 
    126 class PicSpider(Spider):
    127     def parseHomePage(self, htmlSource):
    128         selector = parsel.Selector(htmlSource)
    129         divs = selector.css('div.channelList_image_article_container__3rM_q div a')
    130         articleUrls = []
    131         for a in divs:
    132             articleUrl = a.css('::attr(href)').get()
    133             articleUrls.append(articleUrl)
    134         return articleUrls
    135 
    136 
    137 # 头条
    138 def runTouTiao(path):
    139     homeUrl = 'https://zj.zjol.com.cn/?id=52e5f902cf81d754a434fb50'
    140     toutiao = Spider(url=homeUrl)
    141     homePageSource = toutiao.run()
    142     articleUrls = toutiao.parseHomePage(htmlSource=homePageSource)
    143 
    144     for articleUrl in articleUrls:
    145         content = toutiao.paseArticlePage(articleUrl)
    146         if content != None:
    147             title, content, date = content
    148             toutiao.save_content(f'{path}\头条', title, content, date)
    149             print(title, date, articleUrl)
    150 
    151 
    152 # 天下
    153 def runTianXia(path):
    154     homeUrl = 'https://zj.zjol.com.cn/?id=5d4ba90a159bb84750661d51'
    155     tianxia = Spider(url=homeUrl)
    156     homePageSource = tianxia.run()
    157     articleUrls = tianxia.parseHomePage(htmlSource=homePageSource)
    158 
    159     for articleUrl in articleUrls:
    160         content = tianxia.paseArticlePage(articleUrl)
    161         if content != None:
    162             title, content, date = content
    163             tianxia.save_content(f'{path}\天下', title, content, date)
    164             print(title, date, articleUrl)
    165 
    166 
    167 # 浙江
    168 def runZheJiang(path):
    169     homeUrl = 'https://zj.zjol.com.cn/?id=5d4ba8cd159bb84750661d50'
    170     zhejiang = Spider(url=homeUrl)
    171     homePageSource = zhejiang.run()
    172     articleUrls = zhejiang.parseHomePage(htmlSource=homePageSource)
    173 
    174     for articleUrl in articleUrls:
    175         content = zhejiang.paseArticlePage(articleUrl)
    176         if content != None:
    177             title, content, date = content
    178             zhejiang.save_content(f'{path}\浙江', title, content, date)
    179             print(title, date, articleUrl)
    180 
    181 
    182 # 战疫
    183 def runZhanYi(path):
    184     homeUrl = 'https://zj.zjol.com.cn/?id=5e2e4410b4a13d092b0dc969'
    185     zhanyi = Spider(url=homeUrl)
    186     homePageSource = zhanyi.run()
    187     articleUrls = zhanyi.parseHomePage(htmlSource=homePageSource)
    188 
    189     for articleUrl in articleUrls:
    190         content = zhanyi.paseArticlePage(articleUrl)
    191         if content != None:
    192             title, content, date = content
    193             zhanyi.save_content(f'{path}\战疫', title, content, date)
    194             print(title, date, articleUrl)
    195 
    196 
    197 # 观点
    198 def runGuanDian(path):
    199     homeUrl = 'https://zj.zjol.com.cn/?id=584e6ac7e200b2098f871d3a'
    200     guandian = Spider(url=homeUrl)
    201     homePageSource = guandian.run()
    202     articleUrls = guandian.parseHomePage(htmlSource=homePageSource)
    203 
    204     for articleUrl in articleUrls:
    205         content = guandian.paseArticlePage(articleUrl)
    206         if content != None:
    207             title, content, date = content
    208             guandian.save_content(f'{path}\观点', title, content, date)
    209             print(title, date, articleUrl)
    210 
    211 
    212 # 生活
    213 def runShengHuo(path):
    214     homeUrl = 'https://zj.zjol.com.cn/?id=5534eb21498e2ca4bf9f3c34'
    215     shenghuo = Spider(url=homeUrl)
    216     homePageSource = shenghuo.run()
    217     articleUrls = shenghuo.parseHomePage(htmlSource=homePageSource)
    218 
    219     for articleUrl in articleUrls:
    220         content = shenghuo.paseArticlePage(articleUrl)
    221         if content != None:
    222             title, content, date = content
    223             shenghuo.save_content(f'{path}\生活', title, content, date)
    224             print(title, date, articleUrl)
    225 
    226 
    227 # 图片
    228 def runTuPian(path):
    229     homeUrl = 'https://zj.zjol.com.cn/image-list'
    230     tupian = PicSpider(url=homeUrl)
    231     homePageSource = tupian.run()
    232     articleUrls = tupian.parseHomePage(htmlSource=homePageSource)
    233 
    234     for articleUrl in articleUrls:
    235         content = tupian.paseArticlePage(articleUrl)
    236         if content != None:
    237             title, content, date = content
    238             tupian.save_content(f'{path}\图片', title, content, date)
    239             print(title, date, articleUrl)
    240 
    241 
    242 # 杭州
    243 def runHangZhou(path):
    244     homeUrl = 'https://zj.zjol.com.cn/local?id=53845624e4b08e9fb1cdfc17'
    245     hangzhou = Spider(url=homeUrl)
    246     homePageSource = hangzhou.run()
    247     articleUrls = hangzhou.parseHomePage(htmlSource=homePageSource)
    248 
    249     for articleUrl in articleUrls:
    250         content = hangzhou.paseArticlePage(articleUrl)
    251         if content != None:
    252             title, content, date = content
    253             hangzhou.save_content(f'{path}\杭州', title, content, date)
    254             print(title, date, articleUrl)
    255 
    256 
    257 # 宁波
    258 def runNingBo(path):
    259     homeUrl = 'https://zj.zjol.com.cn/local?id=53845a6fe4b08e9fb1cdfcac'
    260     ningbo = Spider(url=homeUrl)
    261     homePageSource = ningbo.run()
    262     articleUrls = ningbo.parseHomePage(htmlSource=homePageSource)
    263 
    264     for articleUrl in articleUrls:
    265         content = ningbo.paseArticlePage(articleUrl)
    266         if content != None:
    267             title, content, date = content
    268             ningbo.save_content(f'{path}\宁波', title, content, date)
    269             print(title, date, articleUrl)
    270 
    271 
    272 # 温州
    273 def runWenZhou(path):
    274     homeUrl = 'https://zj.zjol.com.cn/local?id=53845aaee4b08e9fb1cdfcb4'
    275     wenzhou = Spider(url=homeUrl)
    276     homePageSource = wenzhou.run()
    277     articleUrls = wenzhou.parseHomePage(htmlSource=homePageSource)
    278 
    279     for articleUrl in articleUrls:
    280         content = wenzhou.paseArticlePage(articleUrl)
    281         if content != None:
    282             title, content, date = content
    283             wenzhou.save_content(f'{path}\温州', title, content, date)
    284             print(title, date, articleUrl)
    285 
    286 
    287 # 湖州
    288 def runHuZhou(path):
    289     homeUrl = 'https://zj.zjol.com.cn/local?id=53845b49e4b08e9fb1cdfcc1'
    290     huzhou = Spider(url=homeUrl)
    291     homePageSource = huzhou.run()
    292     articleUrls = huzhou.parseHomePage(htmlSource=homePageSource)
    293 
    294     for articleUrl in articleUrls:
    295         content = huzhou.paseArticlePage(articleUrl)
    296         if content != None:
    297             title, content, date = content
    298             huzhou.save_content(f'{path}\湖州', title, content, date)
    299             print(title, date, articleUrl)
    300 
    301 
    302 # 嘉兴
    303 def runJiaXing(path):
    304     homeUrl = 'https://zj.zjol.com.cn/local?id=53845af4e4b08e9fb1cdfcbd'
    305     jiaxing = Spider(url=homeUrl)
    306     homePageSource = jiaxing.run()
    307     articleUrls = jiaxing.parseHomePage(htmlSource=homePageSource)
    308 
    309     for articleUrl in articleUrls:
    310         content = jiaxing.paseArticlePage(articleUrl)
    311         if content != None:
    312             title, content, date = content
    313             jiaxing.save_content(f'{path}\嘉兴', title, content, date)
    314             print(title, date, articleUrl)
    315 
    316 
    317 # 绍兴
    318 def runShaoXing(path):
    319     homeUrl = 'https://zj.zjol.com.cn/local?id=53845b81e4b08e9fb1cdfccf'
    320     shaoxing = Spider(url=homeUrl)
    321     homePageSource = shaoxing.run()
    322     articleUrls = shaoxing.parseHomePage(htmlSource=homePageSource)
    323 
    324     for articleUrl in articleUrls:
    325         content = shaoxing.paseArticlePage(articleUrl)
    326         if content != None:
    327             title, content, date = content
    328             shaoxing.save_content(f'{path}\绍兴', title, content, date)
    329             print(title, date, articleUrl)
    330 
    331 
    332 # 金华
    333 def runJinHua(path):
    334     homeUrl = 'https://zj.zjol.com.cn/local?id=53845bd9e4b08e9fb1cdfcda'
    335     jinhua = Spider(url=homeUrl)
    336     homePageSource = jinhua.run()
    337     articleUrls = jinhua.parseHomePage(htmlSource=homePageSource)
    338 
    339     for articleUrl in articleUrls:
    340         content = jinhua.paseArticlePage(articleUrl)
    341         if content != None:
    342             title, content, date = content
    343             jinhua.save_content(f'{path}\金华', title, content, date)
    344             print(title, date, articleUrl)
    345 
    346 
    347 # 衢州
    348 def runQuZhou(path):
    349     homeUrl = 'https://zj.zjol.com.cn/local?id=53845c2ae4b08e9fb1cdfce3'
    350     quzhou = Spider(url=homeUrl)
    351     homePageSource = quzhou.run()
    352     articleUrls = quzhou.parseHomePage(htmlSource=homePageSource)
    353 
    354     for articleUrl in articleUrls:
    355         content = quzhou.paseArticlePage(articleUrl)
    356         if content != None:
    357             title, content, date = content
    358             quzhou.save_content(f'{path}\衢州', title, content, date)
    359             print(title, date, articleUrl)
    360 
    361 
    362 # 舟山
    363 def runZhouShan(path):
    364     homeUrl = 'https://zj.zjol.com.cn/local?id=53845c65e4b08e9fb1cdfce7'
    365     zhoushan = Spider(url=homeUrl)
    366     homePageSource = zhoushan.run()
    367     articleUrls = zhoushan.parseHomePage(htmlSource=homePageSource)
    368 
    369     for articleUrl in articleUrls:
    370         content = zhoushan.paseArticlePage(articleUrl)
    371         if content != None:
    372             title, content, date = content
    373             zhoushan.save_content(f'{path}\舟山', title, content, date)
    374             print(title, date, articleUrl)
    375 
    376 
    377 # 台州
    378 def runTaiZhou(path):
    379     homeUrl = 'https://zj.zjol.com.cn/local?id=53845c96e4b08e9fb1cdfcec'
    380     taizhou = Spider(url=homeUrl)
    381     homePageSource = taizhou.run()
    382     articleUrls = taizhou.parseHomePage(htmlSource=homePageSource)
    383 
    384     for articleUrl in articleUrls:
    385         content = taizhou.paseArticlePage(articleUrl)
    386         if content != None:
    387             title, content, date = content
    388             taizhou.save_content(f'{path}\台州', title, content, date)
    389             print(title, date, articleUrl)
    390 
    391 
    392 # 丽水  来源
    393 def runLiShui(path):
    394     homeUrl = 'https://zj.zjol.com.cn/local?id=53845cd2e4b08e9fb1cdfcf0'
    395     lishui = Spider(url=homeUrl)
    396     homePageSource = lishui.run()
    397     articleUrls = lishui.parseHomePage(htmlSource=homePageSource)
    398 
    399     for articleUrl in articleUrls:
    400         content = lishui.paseArticlePage(articleUrl)
    401         if content != None:
    402             title, content, date = content
    403             lishui.save_content(f'{path}\丽水', title, content, date)
    404             print(title, date, articleUrl)
    405 
    406 
    407 # 义乌
    408 def runYiWu(path):
    409     homeUrl = 'https://zj.zjol.com.cn/local?id=5428f31b498e0d3c0109194e'
    410     tianxia = Spider(url=homeUrl)
    411     homePageSource = tianxia.run()
    412     articleUrls = tianxia.parseHomePage(htmlSource=homePageSource)
    413 
    414     for articleUrl in articleUrls:
    415         content = tianxia.paseArticlePage(articleUrl)
    416         if content != None:
    417             title, content, date = content
    418             tianxia.save_content(f'{path}\义乌', title, content, date)
    419             print(title, date, articleUrl)
    420 
    421 
    422 # 视频
    423 def runShiPin(path):
    424     homeUrl = 'https://zj.zjol.com.cn/?id=57d690e7e200b20fbb4af09f'
    425     shipin = Spider(url=homeUrl)
    426     homePageSource = shipin.run()
    427     articleUrls = shipin.parseHomePage(htmlSource=homePageSource)
    428 
    429     for articleUrl in articleUrls:
    430         content = shipin.paseArticlePage(articleUrl)
    431         if content != None:
    432             title, content, date = content
    433             shipin.save_content(f'{path}\视频', title, content, date)
    434             print(title, date, articleUrl)
    435 
    436 
    437 def runAll():
    438     column_list = [runTouTiao, runTianXia, runZheJiang, runZhanYi, runGuanDian, runShengHuo, runTuPian, runHangZhou,
    439                    runNingBo, runWenZhou, runHuZhou, runJiaXing, runShaoXing, runJinHua, runQuZhou, runZhouShan,
    440                    runTaiZhou, runLiShui, runYiWu, runShiPin]
    441 
    442     with ThreadPoolExecutor(max_workers=5) as t:
    443         for column in column_list:
    444             t.submit(column, 'E:\data\zjxww')
    445 
    446     # for i in range(10):
    447     #     time.sleep(1)
    448     #     print(i)
    449 
    450 
    451 def start(minutes=0):
    452     flg = minutes
    453     schedule.every(minutes).minutes.do(runAll)
    454     while True:
    455         if minutes != 0:
    456             print(f'等待{minutes}分钟')
    457         else:
    458             minutes = flg
    459         schedule.run_pending()
    460 
    461         
    462         time.sleep(60)
    463         minutes = minutes - 1
    464 
    465 
    466 def exists():
    467     column_zh_list = ['头条', '天下', '浙江', '战疫', '观点', '生活', '图片', '杭州', '宁波', '温州',
    468                       '湖州', '嘉兴', '绍兴', '金华', '衢州', '舟山', '台州', '丽水', '义乌', '视频', ]
    469 
    470     if not os.path.exists('E:\data\zjxww'):
    471         os.mkdir('E:\data\zjxww')
    472         for i in column_zh_list:
    473             filedspath = f'E:\data\zjxww\{i}'
    474             os.mkdir(filedspath)
    475     else:
    476         for i in column_zh_list:
    477             filedspath = f'E:\data\zjxww\{i}'
    478             if not os.path.exists(filedspath):
    479                 os.mkdir(filedspath)
    480 
    481 
    482 if __name__ == "__main__":
    483     if len(sys.argv) == 2:
    484         exists()
    485         minutes = int(sys.argv[1])
    486         start(minutes)
    487     if len(sys.argv) == 3:
    488         exists()
    489         flg = int(sys.argv[2])
    490         if flg == 1:
    491             runAll()
    492         minutes = int(sys.argv[1])
    493         start(minutes)
    494 
    495     # runHuZhou('E:\data\zjxww')
  • 相关阅读:
    react脚手架
    快速创建一个node后台管理系统
    vue脚手架结构及vue-router路由配置
    Spring 事务管理-只记录xml部分
    Spring-aspectJ
    Maven 自定义Maven插件
    JVM
    JVM
    Spring
    Digester
  • 原文地址:https://www.cnblogs.com/lixueren-wy/p/15246350.html
Copyright © 2011-2022 走看看