抓取猫眼电影前100

zoukankan html css js c++ java

抓取猫眼电影前100

import json

import requests

import re

import time

from requests.exceptions import RequestException

def get_one_page(url, timeout=10):
    """Download a page and return its HTML text, or None on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200.
        None for any other status code, or when the request raises a
        RequestException (connection failure, timeout, ...).
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
    }
    try:
        # requests applies no timeout by default, so a stalled server would
        # block the scraper indefinitely; a timeout surfaces as a
        # RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        # Echo the raw HTML to stdout, as the original implementation did.
        print(response.text)
        return response.text
    return None

# One match per <dd> entry; the six groups are rank, title, cast line,
# release line, and the integer and fractional parts of the score.
# Every group is terminated by an explicit closing tag: a lazy group followed
# directly by another ".*?" always matches the empty string.
# NOTE(review): the </i> and </p> terminators assume the Maoyan board markup
# (<i> for board-index/integer/fraction, <p> for star/releasetime) — confirm
# against a live page.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>'
    r'.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?'
    r'integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S)  # re.S lets "." cross the newlines inside an entry


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page source as a string. None or an empty string yields
            nothing, so a failed download can be passed through safely.

    Yields:
        Dicts with the keys 'index', 'title', 'actor', 'time' and 'score',
        all holding strings.
    """
    if not html:
        return
    for index, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            'title': title,
            # Drop the first 3 characters of the cast line and the first 5
            # of the release line (presumably the field labels — verify
            # against the page).
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            # The score is split across two elements; join the halves.
            'score': integer + fraction,
        }

def write_to_file(content, path='result.txt'):
    """Append *content* to a text file as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: Destination file; defaults to 'result.txt' in the current
            working directory. The file is opened in append mode.
    """
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file; the
        # newline terminator puts each record on its own line instead of
        # running them together on a single line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    """Scrape one board page and append every movie on it to the result file.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a request error;
        # skip this page instead of handing None to the parser.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # Visit the ten board pages (offsets 0, 10, ..., 90), pausing one second
    # after each page.
    for offset in range(0, 100, 10):
        main(offset=offset)
        time.sleep(1)

查看全文

相关阅读:
【NOIP 2003】加分二叉树
 【POJ 1655】 Balancing Act
【HDU 3613】Best Reward
【POJ 3461】 Oulipo
【POJ 2752】 Seek the Name, Seek the Fame
【POJ 1961】 Period
【POJ 2406】 Power Strings
BZOJ3028 食物（生成函数）
BZOJ5372 PKUSC2018神仙的游戏（NTT）
BZOJ4836 二元运算（分治FFT）

原文地址：https://www.cnblogs.com/jestin/p/12911360.html