# coding: utf-8
"""Scraper for zj.zjol.com.cn (浙江新闻).

Renders each channel page with headless Chrome (the lists are lazy-loaded),
collects article URLs, downloads each article's __NEXT_DATA__ payload and
saves it as a standalone HTML file under E:\\data\\zjxww\\<栏目>\\<date>\\.
Can run once or on a `schedule` timer (see __main__).
"""
import html
import json
import os
import re
import sys
from datetime import datetime
from pprint import pprint

import parsel
import requests
import schedule
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import random
import time
import threading
from concurrent.futures import ThreadPoolExecutor


class WebDriver():
    """Thin wrapper around a headless Chrome driver bound to one channel URL."""

    def __init__(self, url):
        self.url = url
        self.option = Options()
        # Hide the navigator.webdriver automation flag from the site.
        self.option.add_argument('--disable-blink-features=AutomationControlled')
        self.option.add_argument('--headless')  # fix: the flag needs leading dashes
        self.option.add_argument('--disable-gpu')
        self.web = Chrome(options=self.option)

    def run(self):
        """Open self.url, scroll the article container to trigger lazy loading,
        and return the rendered page source."""
        self.web.get(url=self.url)
        self.web.maximize_window()
        self.web.implicitly_wait(20)
        time.sleep(1)

        # Scroll in random-sized steps with random pauses to mimic a human.
        for offset in range(0, 10000, random.randint(100, 150)):
            js = f"document.querySelector('#zjxw_pc').scrollTo(0, {offset})"
            self.web.execute_script(js)
            time.sleep(random.random())
        return self.pageSource()

    def pageSource(self):
        return self.web.page_source


class Spider(WebDriver):
    """Parses channel home pages / article pages and writes articles to disk."""

    def parseHomePage(self, htmlSource):
        """Return the article URLs listed on a text-channel home page."""
        selector = parsel.Selector(htmlSource)
        divs = selector.css('div.channelList_article_container__3dui5 div.listItems_article_item__3B3fT')
        return [div.css('a::attr(href)').get() for div in divs]

    def paseArticlePage(self, articleUrl):
        """Fetch one article page and extract its data.

        Returns (title, html_content, date) or None when the page carries no
        usable __NEXT_DATA__ article payload.
        """
        from faker import Faker  # local import: only needed for a random User-Agent
        headers = {'user-agent': Faker().user_agent()}
        resp = requests.get(url=articleUrl, headers=headers)
        time.sleep(random.random())  # be polite between requests

        selector = parsel.Selector(resp.text)
        try:
            json_data = json.loads(selector.css('script#__NEXT_DATA__::text').get())
            articleData = json_data['props']['pageProps']['article']
        except (TypeError, KeyError, json.JSONDecodeError):
            # TypeError: the script tag is missing (css().get() returned None);
            # KeyError / JSONDecodeError: payload present but shaped unexpectedly.
            return None

        docTitle = articleData['docTitle']
        publishedAt = articleData['publishedAt']
        # publishedAt looks like a millisecond timestamp; the first 10 digits
        # are the seconds part — TODO confirm against a live payload.
        dateObj = datetime.fromtimestamp(int(str(publishedAt)[0:10]))
        publishedAt = dateObj.strftime('%Y-%m-%d %H:%M:%S')
        content = articleData['content']
        source = articleData['source']
        author = articleData['author']
        author = f'<p>作者:{author}</p>' if author else ''
        webLink = articleData['webLink']
        if webLink != '':
            # Article embeds a video: prepend the unescaped <video> markup.
            video = ''.join(selector.css('video').getall())
            video = html.unescape(video)
            video = f'<div div align="center" style="text-align: left;">{video}</div>'
            content = video + content

        date = publishedAt.split(' ', 1)[0]
        pub_date = publishedAt + '@' + source
        column_url = self.url
        # fix: the adjacent f-strings were never grouped, so only the first
        # line was assigned to `head`; parenthesize to concatenate all four.
        head = (
            f'<p class="column_url"><a href="{column_url}">栏目地址:{column_url}</a></p> '
            f'<p class="chapter_url"><a href="{articleUrl}">文章详细地址:{articleUrl}</a></p> '
            f'<p class="Summary"></p> '
            f'<p class="pub_data">{pub_date}</p> '
        )
        content = f'<div class="content">{author}{content}</div>'
        content = head + ' ' + content
        return docTitle, content, date

    def save_content(self, path, title, content, date):
        """Write one article to <path>/<date>/<sanitized title>.html."""
        new_date = str(date).replace('/', '-')
        timestamp = str(time.time()).split('.')[1]
        if title is None:
            if content:
                title = timestamp  # untitled but non-empty: use a pseudo-random name
            else:
                return  # nothing worth saving
        # Keep only CJK characters, alphanumerics and a few punctuation marks
        # so the title is safe to use as a file name.
        new_title = ''.join(re.findall(r'([\u2E80-\u9FFF0-9a-zA-Z“”:?!《》,-]+)', title))
        new_title = new_title.replace('.', '点').replace('BR', '')

        fields_path = os.path.join(path, new_date)
        # fix: os.mkdir raises when the parent is missing or the dir exists.
        os.makedirs(fields_path, exist_ok=True)
        fields_name = os.path.join(fields_path, f'{new_title}.html')
        with open(fields_name, mode='w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html> <html> ')
            f.write('<head><meta charset="UTF-8"></head> ')
            f.write('<body> ')
            f.write(f'<h1 align="center">{title}</h1> ')
            f.write(content)
            f.write(' </body> </html>')


class PicSpider(Spider):
    """Spider variant for the image channel, whose list uses different markup."""

    def parseHomePage(self, htmlSource):
        selector = parsel.Selector(htmlSource)
        anchors = selector.css('div.channelList_image_article_container__3rM_q div a')
        return [a.css('::attr(href)').get() for a in anchors]


def _run_column(path, column_name, home_url, spider_cls=Spider):
    """Scrape one channel: render its home page, then fetch and save every article.

    Shared implementation behind all the run<Column>() entry points below.
    """
    spider = spider_cls(url=home_url)
    homePageSource = spider.run()
    articleUrls = spider.parseHomePage(htmlSource=homePageSource)

    for articleUrl in articleUrls:
        result = spider.paseArticlePage(articleUrl)
        if result is not None:
            title, content, date = result
            spider.save_content(os.path.join(path, column_name), title, content, date)
            print(title, date, articleUrl)


# 头条
def runTouTiao(path):
    _run_column(path, '头条', 'https://zj.zjol.com.cn/?id=52e5f902cf81d754a434fb50')


# 天下
def runTianXia(path):
    _run_column(path, '天下', 'https://zj.zjol.com.cn/?id=5d4ba90a159bb84750661d51')


# 浙江
def runZheJiang(path):
    _run_column(path, '浙江', 'https://zj.zjol.com.cn/?id=5d4ba8cd159bb84750661d50')


# 战疫
def runZhanYi(path):
    _run_column(path, '战疫', 'https://zj.zjol.com.cn/?id=5e2e4410b4a13d092b0dc969')


# 观点
def runGuanDian(path):
    _run_column(path, '观点', 'https://zj.zjol.com.cn/?id=584e6ac7e200b2098f871d3a')


# 生活
def runShengHuo(path):
    _run_column(path, '生活', 'https://zj.zjol.com.cn/?id=5534eb21498e2ca4bf9f3c34')


# 图片 — the image channel needs the PicSpider parser.
def runTuPian(path):
    _run_column(path, '图片', 'https://zj.zjol.com.cn/image-list', spider_cls=PicSpider)


# 杭州
def runHangZhou(path):
    _run_column(path, '杭州', 'https://zj.zjol.com.cn/local?id=53845624e4b08e9fb1cdfc17')


# 宁波
def runNingBo(path):
    _run_column(path, '宁波', 'https://zj.zjol.com.cn/local?id=53845a6fe4b08e9fb1cdfcac')


# 温州
def runWenZhou(path):
    _run_column(path, '温州', 'https://zj.zjol.com.cn/local?id=53845aaee4b08e9fb1cdfcb4')


# 湖州
def runHuZhou(path):
    _run_column(path, '湖州', 'https://zj.zjol.com.cn/local?id=53845b49e4b08e9fb1cdfcc1')


# 嘉兴
def runJiaXing(path):
    _run_column(path, '嘉兴', 'https://zj.zjol.com.cn/local?id=53845af4e4b08e9fb1cdfcbd')


# 绍兴
def runShaoXing(path):
    _run_column(path, '绍兴', 'https://zj.zjol.com.cn/local?id=53845b81e4b08e9fb1cdfccf')


# 金华
def runJinHua(path):
    _run_column(path, '金华', 'https://zj.zjol.com.cn/local?id=53845bd9e4b08e9fb1cdfcda')


# 衢州
def runQuZhou(path):
    _run_column(path, '衢州', 'https://zj.zjol.com.cn/local?id=53845c2ae4b08e9fb1cdfce3')


# 舟山
def runZhouShan(path):
    _run_column(path, '舟山', 'https://zj.zjol.com.cn/local?id=53845c65e4b08e9fb1cdfce7')


# 台州
def runTaiZhou(path):
    _run_column(path, '台州', 'https://zj.zjol.com.cn/local?id=53845c96e4b08e9fb1cdfcec')


# 丽水
def runLiShui(path):
    _run_column(path, '丽水', 'https://zj.zjol.com.cn/local?id=53845cd2e4b08e9fb1cdfcf0')


# 义乌
def runYiWu(path):
    _run_column(path, '义乌', 'https://zj.zjol.com.cn/local?id=5428f31b498e0d3c0109194e')


# 视频
def runShiPin(path):
    _run_column(path, '视频', 'https://zj.zjol.com.cn/?id=57d690e7e200b20fbb4af09f')


def runAll():
    """Run every column scraper, at most five concurrently."""
    column_list = [runTouTiao, runTianXia, runZheJiang, runZhanYi, runGuanDian,
                   runShengHuo, runTuPian, runHangZhou, runNingBo, runWenZhou,
                   runHuZhou, runJiaXing, runShaoXing, runJinHua, runQuZhou,
                   runZhouShan, runTaiZhou, runLiShui, runYiWu, runShiPin]

    with ThreadPoolExecutor(max_workers=5) as pool:
        for column in column_list:
            pool.submit(column, r'E:\data\zjxww')


def start(minutes=0):
    """Schedule runAll() every `minutes` minutes and block forever,
    printing a one-line countdown each minute."""
    flg = minutes
    schedule.every(minutes).minutes.do(runAll)
    while True:
        if minutes != 0:
            print(f'等待{minutes}分钟')
        else:
            minutes = flg  # countdown reached zero: reset it for the next cycle

        schedule.run_pending()
        time.sleep(60)
        minutes = minutes - 1


def exists():
    """Ensure the output directory tree (one folder per column) exists."""
    column_zh_list = ['头条', '天下', '浙江', '战疫', '观点', '生活', '图片', '杭州', '宁波', '温州',
                      '湖州', '嘉兴', '绍兴', '金华', '衢州', '舟山', '台州', '丽水', '义乌', '视频']

    base = r'E:\data\zjxww'
    # fix: os.makedirs(..., exist_ok=True) replaces the duplicated
    # exists/mkdir branches and never raises on an already-present dir.
    os.makedirs(base, exist_ok=True)
    for name in column_zh_list:
        os.makedirs(os.path.join(base, name), exist_ok=True)


if __name__ == "__main__":
    # Usage: script.py <minutes>           -> schedule only
    #        script.py <minutes> <flag>    -> flag == 1 runs once immediately too
    if len(sys.argv) == 2:
        exists()
        minutes = int(sys.argv[1])
        start(minutes)
    if len(sys.argv) == 3:
        exists()
        flg = int(sys.argv[2])
        if flg == 1:
            runAll()
        minutes = int(sys.argv[1])
        start(minutes)