"""Scrape the Qidian monthly-ticket ranking and save each entry as a JSON line.

Fetches https://www.qidian.com/rank/yuepiao pages 1-5 and appends one
JSON object per book ({'rank', 'title', 'author'}) to result.txt.
"""
import json
import re
import time

import requests
from requests.exceptions import RequestException

# Compiled once at import time instead of on every parse call.
# NOTE: the character classes here were mangled to '[sS]' in the original
# (matching only literal 's'/'S'); restored to '[\s\S]' = "any character,
# including newlines".
_RANK_PATTERN = re.compile(
    r'<li data-rid.*?>[\s\S]*?<span class=.*?>(.*?)<cite>[\s\S]*?'
    r'<h4><a.*?data-bid=.*?>(.*?)</a></h4>[\s\S]*?'
    r'<p class="author">[\s\S]*?<img.*?data-eid=.*?>(.*?)</a>',
    re.S,
)


def get_html_page(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Returns None both for network errors (RequestException) and for any
    non-200 status code, so callers must check before parsing.
    """
    try:
        headers = {
            # Desktop Chrome UA so the site serves the normal page.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.75 Safari/537.36',
        }
        # timeout added so a stalled connection cannot hang the scraper.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def get_parse_page(html):
    """Yield one {'rank', 'title', 'author'} dict per book found in *html*."""
    for rank, title, author in _RANK_PATTERN.findall(html):
        yield {
            'rank': rank,
            'title': title,
            'author': author,
        }


def write_to_file(content):
    """Append *content* to result.txt as one JSON line (JSON-lines format).

    ensure_ascii=False keeps the Chinese titles/authors human-readable.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(page):
    """Download ranking *page* (1-based) and persist every parsed entry."""
    url = 'https://www.qidian.com/rank/yuepiao?page=' + str(page)
    html = get_html_page(url)
    if html is None:
        # Fetch failed (network error or bad status) — skip this page
        # instead of crashing on findall(None).
        return
    for item in get_parse_page(html):
        write_to_file(item)


if __name__ == '__main__':
    for i in range(5):
        main(i + 1)
        time.sleep(1)  # throttle: at most one request per second