Scraping Movies with Requests + Regular Expressions
1. Target site analysis
- Fetch a single page: use requests to request the target site, obtain the HTML of one page, and return the result
- Parse with regular expressions: extract each movie's title, cast, release date, rating, poster URL, and other fields from the HTML
- Save to a file: write the results to a file, one movie per line as a JSON string (a sample line is shown below)
- Loop and parallelize: iterate over all pages of the board and use a pool of workers to speed up scraping (the implementation below uses a multiprocessing pool)
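
For illustration, one line of result.txt as written by the code in the next section would look like this (values hypothetical):

```
{"index": "1", "image": "http://example.com/poster.jpg", "title": "霸王别姬", "actor": "张国荣,张丰毅,巩俐", "time": "1993-01-01", "score": "9.6"}
```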
2. Implementation
```python
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        # Sending a browser User-Agent is essential; without it the site
        # will very likely block the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the "主演:" prefix
            'time': item[4].strip()[5:],   # drop the "上映时间:" prefix
            'score': item[5] + item[6]     # integer part + fractional part
        }


def write_to_file(content):
    # content is a dict, so serialize it to a JSON string first
    with open('result.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html:  # skip the page if the request failed
        for item in parse_one_page(html):
            write_to_file(item)


if __name__ == '__main__':
    # Sequential version:
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()
    # Fetch the ten pages with a pool of processes to speed things up
    pool.map(main, [i * 10 for i in range(10)])
```
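
To check the regular expression without hitting the site, it can be run against a handcrafted `<dd>` fragment modeled on Maoyan's board markup. A minimal sketch (the fragment is simplified; the real page carries extra attributes and whitespace):

```python
import re

# Simplified <dd> block imitating one entry on http://maoyan.com/board/4
sample = '''<dd>
<i class="board-index board-index-1">1</i>
<img data-src="http://example.com/poster.jpg">
<p class="name"><a href="/films/1203">霸王别姬</a></p>
<p class="star">主演:张国荣,张丰毅,巩俐</p>
<p class="releasetime">上映时间:1993-01-01</p>
<p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>'''

# Same pattern as parse_one_page above
pattern = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)

print(re.findall(pattern, sample))
# -> [('1', 'http://example.com/poster.jpg', '霸王别姬',
#      '主演:张国荣,张丰毅,巩俐', '上映时间:1993-01-01', '9.', '6')]
```

Each tuple maps field by field onto the dict yielded by parse_one_page, which is then written out as the JSON line shown earlier.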
Example: Scraping Street-Snap Images
This example needs the following libraries and database: requests for HTTP, BeautifulSoup and regular expressions for parsing, and MongoDB for storage, accessed through the pymongo driver.
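If setting up from scratch, these dependencies can typically be installed with `pip install requests beautifulsoup4 lxml pymongo` (lxml is the parser handed to BeautifulSoup below), and a MongoDB server must be running at the address given in config.py.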
1. Workflow
- Fetch the index pages: use requests to request the target site, obtain the HTML of each index page, and return the result
- Fetch the detail pages: parse the index response to get the detail-page links, then fetch each detail page for its content
- Download images and save data: download the images to disk, and store the page info together with the image URLs in MongoDB (the stored document shape is sketched below)
- Loop and parallelize: iterate over multiple index pages and use a pool of workers to speed up scraping (again via multiprocessing)
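
For reference, each detail page ends up as one MongoDB document shaped like the dict returned by the parser in the code below; a hypothetical example:

```python
# Hypothetical shape of one stored document (field values invented for illustration)
{
    'title': '街拍美图集',                       # text of the detail page <title>
    'url': 'https://www.toutiao.com/a12345/',    # detail page URL
    'images': ['http://example.com/img/1.jpg']   # image URLs taken from sub_images
}
```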
2. Implementation
spider.py
```python
import hashlib
import json
import os
import re
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import *

# connect=False defers the actual connection until first use, which avoids
# sharing one connection across the multiprocessing workers
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}


def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request index page')
        return None


def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')


def get_page_detail(url):
    try:
        if url:
            response = requests.get(url, headers=HEADERS)
            if response.status_code == 200:
                return response.text
        return None
    except RequestException:
        print('Error fetching detail page', url)
        return None


def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # The gallery data is embedded in the page as a JSON.parse("...") call
    images_pattern = re.compile(r'JSON.parse\("(.*?)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = result.group(1).replace('\\"', '"')  # unescape the embedded JSON
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    # insert_one is the current pymongo API for single-document inserts
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB')
        return True
    return False


def download_image(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('Failed to request image')


def save_image(content):
    # Name the file after the MD5 of its content so duplicates are skipped
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), hashlib.md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    html = get_page_index(offset, KEYWORD)
    if html:
        for url in parse_page_index(html):
            detail_html = get_page_detail(url)
            if detail_html:
                result = parse_page_detail(detail_html, url)
                if result:
                    save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
```
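
The trickiest step above is recovering the gallery JSON that the page embeds as a `JSON.parse("...")` call. A minimal standalone sketch of that unescaping logic, run against a hypothetical fragment shaped like Toutiao's payload:

```python
import json
import re

# Hypothetical page fragment modeled on Toutiao's embedded gallery payload
html = 'var gallery = JSON.parse("{\\"sub_images\\": [{\\"url\\": \\"http://example.com/1.jpg\\"}]}");'

m = re.search(r'JSON.parse\("(.*?)"\)', html, re.S)
payload = m.group(1).replace('\\"', '"')   # turn the escaped \" back into plain "
data = json.loads(payload)
print([img['url'] for img in data['sub_images']])
# -> ['http://example.com/1.jpg']
```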
config.py: configuration file
```python
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'
```
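
A quick sanity check of what spider.py derives from these values (assuming config.py is importable from the working directory):

```python
# Offsets passed to main() by pool.map, given the config above
from config import GROUP_START, GROUP_END

groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
print(groups[0], groups[-1], len(groups))   # 20 400 20
```

Note that GROUP_START=1 means offset 0 (the first 20 search results) is never requested; set GROUP_START=0 if the first page should be included.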