zoukankan html css js c++ java

[Python爬虫]起点中文网小说排行榜

import requests
import re
import time
import json
from requests.exceptions import RequestException
def get_html_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
    }
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection; a timeout surfaces as a RequestException.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_parse_page(html):
    """Extract rank, title and author for every book entry in ``html``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) produces no items.

    Yields:
        One dict per matched ``<li data-rid ...>`` entry with the keys
        ``'rank'``, ``'title'`` and ``'author'``; values are the raw
        captured text.
    """
    if not html:
        # re.findall raises TypeError on None; nothing to parse anyway.
        return
    # ``[\s\S]`` matches any character including newlines. Raw strings keep
    # the backslashes intact (a plain ``[sS]`` only matches the letters s/S).
    pattern = re.compile(
        r'<li data-rid.*?>[\s\S]*?<span class=.*?>(.*?)<cite>[\s\S]*?<h4><a.*?data-bid=.*?>(.*?)</a>'
        r'</h4>[\s\S]*?<p class="author">[\s\S]*?<img.*?data-eid=.*?>(.*?)</a>',
        re.S,
    )
    for rank, title, author in pattern.findall(html):
        yield {
            'rank': rank,
            'title': title,
            'author': author,
        }

#<h4><a.*?data-bid=.*?>(.*?)</a></h4>[\s\S]*?<p class="author">[\s\S]*?<a.*?target="_blank">(.*?)</a><em>.*?</span>[\s\S.]*?<p class="intro">[\s\S]*?(.*?)[\s\S]*?</p>

def write_to_file(content, path='result.txt'):
    """Append ``content`` to ``path`` as a single JSON line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; it is created if it does not exist.
    """
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file;
        # the trailing '\n' puts each record on its own line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(page):
    """Scrape one page of the qidian.com ``rank/yuepiao`` list and store it.

    Args:
        page: Page number appended to the ranking URL's ``page`` query
            parameter.

    Every entry produced by ``get_parse_page`` is passed to
    ``write_to_file``. When the download fails the page is skipped.
    """
    url = 'https://www.qidian.com/rank/yuepiao?page=' + str(page)
    html = get_html_page(url)
    if html is None:
        # get_html_page returns None on a request error or non-200 status;
        # handing None to the regex parser would raise TypeError.
        print('Failed to fetch page ' + str(page) + ', skipping')
        return
    for con in get_parse_page(html):
        write_to_file(con)
if __name__ == '__main__':
    # Process pages 1 through 5, pausing one second after each page.
    for page_number in range(1, 6):
        main(page_number)
        time.sleep(1)

查看全文

相关阅读:
哈夫曼编码拓展题
 TrieTree
并查集
 hash一致性
 布隆过滤器
 如何计算完全二叉树的结点数？
如何翻转单链表和双向链表
 如何判断是搜索二叉树与完全二叉树
 文本分类之特征描述vsm和bow
文本分类概述

原文地址：https://www.cnblogs.com/lightmonster/p/11556174.html