# coding: utf-8
"""Scraper for zj.zjol.com.cn (浙江新闻).

Renders each channel page with headless Chrome (the lists are lazy-loaded),
collects article URLs, downloads each article's __NEXT_DATA__ payload and
saves it as a standalone HTML file under E:\\data\\zjxww\\<栏目>\\<date>\\.
Can run once or on a `schedule` timer (see __main__).
"""
import html
import json
import os
import re
import sys
from datetime import datetime
from pprint import pprint

import parsel
import requests
import schedule
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import random
import time
import threading
from concurrent.futures import ThreadPoolExecutor


class WebDriver():
    """Thin wrapper around a headless Chrome driver bound to one channel URL."""

    def __init__(self, url):
        self.url = url
        self.option = Options()
        # Hide the navigator.webdriver automation flag from the site.
        self.option.add_argument('--disable-blink-features=AutomationControlled')
        self.option.add_argument('--headless')  # fix: the flag needs leading dashes
        self.option.add_argument('--disable-gpu')
        self.web = Chrome(options=self.option)

    def run(self):
        """Open self.url, scroll the article container to trigger lazy loading,
        and return the rendered page source."""
        self.web.get(url=self.url)
        self.web.maximize_window()
        self.web.implicitly_wait(20)
        time.sleep(1)

        # Scroll in random-sized steps with random pauses to mimic a human.
        for offset in range(0, 10000, random.randint(100, 150)):
            js = f"document.querySelector('#zjxw_pc').scrollTo(0, {offset})"
            self.web.execute_script(js)
            time.sleep(random.random())
        return self.pageSource()

    def pageSource(self):
        return self.web.page_source


class Spider(WebDriver):
    """Parses channel home pages / article pages and writes articles to disk."""

    def parseHomePage(self, htmlSource):
        """Return the article URLs listed on a text-channel home page."""
        selector = parsel.Selector(htmlSource)
        divs = selector.css('div.channelList_article_container__3dui5 div.listItems_article_item__3B3fT')
        return [div.css('a::attr(href)').get() for div in divs]

    def paseArticlePage(self, articleUrl):
        """Fetch one article page and extract its data.

        Returns (title, html_content, date) or None when the page carries no
        usable __NEXT_DATA__ article payload.
        """
        from faker import Faker  # local import: only needed for a random User-Agent
        headers = {'user-agent': Faker().user_agent()}
        resp = requests.get(url=articleUrl, headers=headers)
        time.sleep(random.random())  # be polite between requests

        selector = parsel.Selector(resp.text)
        try:
            json_data = json.loads(selector.css('script#__NEXT_DATA__::text').get())
            articleData = json_data['props']['pageProps']['article']
        except (TypeError, KeyError, json.JSONDecodeError):
            # TypeError: the script tag is missing (css().get() returned None);
            # KeyError / JSONDecodeError: payload present but shaped unexpectedly.
            return None

        docTitle = articleData['docTitle']
        publishedAt = articleData['publishedAt']
        # publishedAt looks like a millisecond timestamp; the first 10 digits
        # are the seconds part — TODO confirm against a live payload.
        dateObj = datetime.fromtimestamp(int(str(publishedAt)[0:10]))
        publishedAt = dateObj.strftime('%Y-%m-%d %H:%M:%S')
        content = articleData['content']
        source = articleData['source']
        author = articleData['author']
        author = f'<p>作者:{author}</p>' if author else ''
        webLink = articleData['webLink']
        if webLink != '':
            # Article embeds a video: prepend the unescaped <video> markup.
            video = ''.join(selector.css('video').getall())
            video = html.unescape(video)
            video = f'<div div align="center" style="text-align: left;">{video}</div>'
            content = video + content

        date = publishedAt.split(' ', 1)[0]
        pub_date = publishedAt + '@' + source
        column_url = self.url
        # fix: the adjacent f-strings were never grouped, so only the first
        # line was assigned to `head`; parenthesize to concatenate all four.
        head = (
            f'<p class="column_url"><a href="{column_url}">栏目地址:{column_url}</a></p> '
            f'<p class="chapter_url"><a href="{articleUrl}">文章详细地址:{articleUrl}</a></p> '
            f'<p class="Summary"></p> '
            f'<p class="pub_data">{pub_date}</p> '
        )
        content = f'<div class="content">{author}{content}</div>'
        content = head + ' ' + content
        return docTitle, content, date

    def save_content(self, path, title, content, date):
        """Write one article to <path>/<date>/<sanitized title>.html."""
        new_date = str(date).replace('/', '-')
        timestamp = str(time.time()).split('.')[1]
        if title is None:
            if content:
                title = timestamp  # untitled but non-empty: use a pseudo-random name
            else:
                return  # nothing worth saving
        # Keep only CJK characters, alphanumerics and a few punctuation marks
        # so the title is safe to use as a file name.
        new_title = ''.join(re.findall(r'([\u2E80-\u9FFF0-9a-zA-Z“”:?!《》,-]+)', title))
        new_title = new_title.replace('.', '点').replace('BR', '')

        fields_path = os.path.join(path, new_date)
        # fix: os.mkdir raises when the parent is missing or the dir exists.
        os.makedirs(fields_path, exist_ok=True)
        fields_name = os.path.join(fields_path, f'{new_title}.html')
        with open(fields_name, mode='w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html> <html> ')
            f.write('<head><meta charset="UTF-8"></head> ')
            f.write('<body> ')
            f.write(f'<h1 align="center">{title}</h1> ')
            f.write(content)
            f.write(' </body> </html>')


class PicSpider(Spider):
    """Spider variant for the image channel, whose list uses different markup."""

    def parseHomePage(self, htmlSource):
        selector = parsel.Selector(htmlSource)
        anchors = selector.css('div.channelList_image_article_container__3rM_q div a')
        return [a.css('::attr(href)').get() for a in anchors]


def _run_column(path, column_name, home_url, spider_cls=Spider):
    """Scrape one channel: render its home page, then fetch and save every article.

    Shared implementation behind all the run<Column>() entry points below.
    """
    spider = spider_cls(url=home_url)
    homePageSource = spider.run()
    articleUrls = spider.parseHomePage(htmlSource=homePageSource)

    for articleUrl in articleUrls:
        result = spider.paseArticlePage(articleUrl)
        if result is not None:
            title, content, date = result
            spider.save_content(os.path.join(path, column_name), title, content, date)
            print(title, date, articleUrl)


# 头条
def runTouTiao(path):
    _run_column(path, '头条', 'https://zj.zjol.com.cn/?id=52e5f902cf81d754a434fb50')


# 天下
def runTianXia(path):
    _run_column(path, '天下', 'https://zj.zjol.com.cn/?id=5d4ba90a159bb84750661d51')


# 浙江
def runZheJiang(path):
    _run_column(path, '浙江', 'https://zj.zjol.com.cn/?id=5d4ba8cd159bb84750661d50')


# 战疫
def runZhanYi(path):
    _run_column(path, '战疫', 'https://zj.zjol.com.cn/?id=5e2e4410b4a13d092b0dc969')


# 观点
def runGuanDian(path):
    _run_column(path, '观点', 'https://zj.zjol.com.cn/?id=584e6ac7e200b2098f871d3a')


# 生活
def runShengHuo(path):
    _run_column(path, '生活', 'https://zj.zjol.com.cn/?id=5534eb21498e2ca4bf9f3c34')


# 图片 — the image channel needs the PicSpider parser.
def runTuPian(path):
    _run_column(path, '图片', 'https://zj.zjol.com.cn/image-list', spider_cls=PicSpider)


# 杭州
def runHangZhou(path):
    _run_column(path, '杭州', 'https://zj.zjol.com.cn/local?id=53845624e4b08e9fb1cdfc17')


# 宁波
def runNingBo(path):
    _run_column(path, '宁波', 'https://zj.zjol.com.cn/local?id=53845a6fe4b08e9fb1cdfcac')


# 温州
def runWenZhou(path):
    _run_column(path, '温州', 'https://zj.zjol.com.cn/local?id=53845aaee4b08e9fb1cdfcb4')


# 湖州
def runHuZhou(path):
    _run_column(path, '湖州', 'https://zj.zjol.com.cn/local?id=53845b49e4b08e9fb1cdfcc1')


# 嘉兴
def runJiaXing(path):
    _run_column(path, '嘉兴', 'https://zj.zjol.com.cn/local?id=53845af4e4b08e9fb1cdfcbd')


# 绍兴
def runShaoXing(path):
    _run_column(path, '绍兴', 'https://zj.zjol.com.cn/local?id=53845b81e4b08e9fb1cdfccf')


# 金华
def runJinHua(path):
    _run_column(path, '金华', 'https://zj.zjol.com.cn/local?id=53845bd9e4b08e9fb1cdfcda')


# 衢州
def runQuZhou(path):
    _run_column(path, '衢州', 'https://zj.zjol.com.cn/local?id=53845c2ae4b08e9fb1cdfce3')


# 舟山
def runZhouShan(path):
    _run_column(path, '舟山', 'https://zj.zjol.com.cn/local?id=53845c65e4b08e9fb1cdfce7')


# 台州
def runTaiZhou(path):
    _run_column(path, '台州', 'https://zj.zjol.com.cn/local?id=53845c96e4b08e9fb1cdfcec')


# 丽水
def runLiShui(path):
    _run_column(path, '丽水', 'https://zj.zjol.com.cn/local?id=53845cd2e4b08e9fb1cdfcf0')


# 义乌
def runYiWu(path):
    _run_column(path, '义乌', 'https://zj.zjol.com.cn/local?id=5428f31b498e0d3c0109194e')


# 视频
def runShiPin(path):
    _run_column(path, '视频', 'https://zj.zjol.com.cn/?id=57d690e7e200b20fbb4af09f')


def runAll():
    """Run every column scraper, at most five concurrently."""
    column_list = [runTouTiao, runTianXia, runZheJiang, runZhanYi, runGuanDian,
                   runShengHuo, runTuPian, runHangZhou, runNingBo, runWenZhou,
                   runHuZhou, runJiaXing, runShaoXing, runJinHua, runQuZhou,
                   runZhouShan, runTaiZhou, runLiShui, runYiWu, runShiPin]

    with ThreadPoolExecutor(max_workers=5) as pool:
        for column in column_list:
            pool.submit(column, r'E:\data\zjxww')


def start(minutes=0):
    """Schedule runAll() every `minutes` minutes and block forever,
    printing a one-line countdown each minute."""
    flg = minutes
    schedule.every(minutes).minutes.do(runAll)
    while True:
        if minutes != 0:
            print(f'等待{minutes}分钟')
        else:
            minutes = flg  # countdown reached zero: reset it for the next cycle

        schedule.run_pending()
        time.sleep(60)
        minutes = minutes - 1


def exists():
    """Ensure the output directory tree (one folder per column) exists."""
    column_zh_list = ['头条', '天下', '浙江', '战疫', '观点', '生活', '图片', '杭州', '宁波', '温州',
                      '湖州', '嘉兴', '绍兴', '金华', '衢州', '舟山', '台州', '丽水', '义乌', '视频']

    base = r'E:\data\zjxww'
    # fix: os.makedirs(..., exist_ok=True) replaces the duplicated
    # exists/mkdir branches and never raises on an already-present dir.
    os.makedirs(base, exist_ok=True)
    for name in column_zh_list:
        os.makedirs(os.path.join(base, name), exist_ok=True)


if __name__ == "__main__":
    # Usage: script.py <minutes>           -> schedule only
    #        script.py <minutes> <flag>    -> flag == 1 runs once immediately too
    if len(sys.argv) == 2:
        exists()
        minutes = int(sys.argv[1])
        start(minutes)
    if len(sys.argv) == 3:
        exists()
        flg = int(sys.argv[2])
        if flg == 1:
            runAll()
        minutes = int(sys.argv[1])
        start(minutes)