  • [Crawler] Crawler write-ups for common websites

    001. Baidu Tieba

    #!/usr/bin/python3
    # The shebang above tells the OS to run this script with the python3 interpreter under /usr/bin
    # -*- coding: utf-8 -*-
    
    
    """
    Request URL analysis	https://tieba.baidu.com/f?kw=魔兽世界&ie=utf-8&pn=50
    Request method	GET
    Request parameters	pn increases by 50 per page; all other parameters stay fixed
    Request headers	only a User-Agent needs to be added
    """
    
    # Implementation outline
    # 1. Build the crawler as a class
    # 2. Four steps of the crawl:
    # 2.1 Build the URL list
    # 2.2 Send requests and collect responses
    # 2.3 Extract data from the responses
    # 2.4 Save the data
    
    import requests
    
    
    class TieBa_Spier():
    
    	def __init__(self, max_page, kw):
    		# Initialization
    		self.max_page = max_page  # maximum number of pages to crawl
    		self.kw = kw  # tieba (forum) name
    		self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
    		self.headers = {
    			"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    		}
    
    	def get_url_list(self):
    		"""获取url列表"""
    		# 根据pn每50进入下一页,构建url列表
    		return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_page * 50, 50)]
    
    	def get_content(self, url):
    		"""发送请求获取响应内容"""
    		response = requests.get(
    			url=url,
    			headers=self.headers
    		)
    		# print(response.text)
    		return response.content
    
    	def save_items(self, content, idx):
    		"""从响应内容中提取数据"""
    		with open('{}.html'.format(idx), 'wb') as f:
    			f.write(content)
    		return None
    
    	def run(self):
    		"""运行程序"""
    		# 获取url_list
    		url_list = self.get_url_list()
    		for url in url_list:
    			# 发送请求获取响应
    			content = self.get_content(url)
    
    			# 保存数据,按照url的索引+1命名保存的文件
    			items = self.save_items(content, url_list.index(url) + 1)
    
    			# 测试
    			# print(items)
    
    
    if __name__ == '__main__':
    	# (max page count, tieba name)
    	spider = TieBa_Spier(2, "神无月")
    	spider.run()
    

     002. JD.com product comments

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    import requests
    import re
    import pandas as pd
    
    """
    Request URL analysis	https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1
    Request method	GET
    Request parameters	page increases by 1 per page; all other parameters stay fixed
    Request headers	no User-Agent needed
    """
    
    
    # Implementation outline
    # 1. Build the crawler as a class
    # 2. Four steps of the crawl:
    # 2.1 Build the URL list
    # 2.2 Send requests and collect responses
    # 2.3 Extract data from the responses
    # 2.4 Save the data
    
    
    class JD_Spier():
    
    	def __init__(self, max_page):
    		# Initialization
    		self.max_page = max_page  # maximum number of pages to crawl
    		self.base_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"
    
    	def get_url_list(self):
    		"""获取url列表"""
    		# 根据page每1进入下一页,构建url列表
    		return [self.base_url.format(page) for page in range(0, self.max_page, 1)]
    
    	def get_content(self, url):
    		"""发送请求获取响应内容"""
    		response = requests.get(url=url)
    		# print(response.text)
    		return response.text
    
    	def save_items(self, content):
    		"""从响应内容中提取数据"""
    		with open('comment_iphone11.txt', 'a', encoding='utf-8') as f:
    			pat = '"content":"(.*?)","'
    			res = re.findall(pat, content)
    			for index, i in enumerate(res):
    				i = i.replace('\n', '')
    				# print(i)
    				f.write(str(index) + ':' + i)
    				f.write('
    ')
    			f.write('
    ')
    		return None
    
    	def run(self):
    		"""运行程序"""
    		# 获取url_list
    		url_list = self.get_url_list()
    		for index, url in enumerate(url_list):
    			# Send the request and get the response
    			try:
    				print('Crawling page %s...' % index)
    				content = self.get_content(url)
    				# Save the data
    				self.save_items(content)
    			except Exception:
    				print('Problem crawling page ' + str(index))
    				continue
    
    
    if __name__ == '__main__':
    	# maximum page count
    	spider = JD_Spier(99)
    	spider.run()
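
    Since the endpoint returns JSONP (note the callback=fetchJSON_comment98vv4962 parameter), an alternative to the regex is to strip the callback wrapper and parse the payload as real JSON. A minimal sketch, assuming the endpoint's usual comments / content layout:

    import json
    import requests
    
    
    def fetch_comments(url):
    	"""Strip the JSONP wrapper and return the comment strings (hedged sketch)."""
    	text = requests.get(url).text
    	# Drop everything up to the first '(' and after the last ')'
    	payload = text[text.index('(') + 1: text.rindex(')')]
    	data = json.loads(payload)
    	# 'comments' and 'content' are the fields this endpoint is assumed to expose
    	return [c.get('content', '') for c in data.get('comments', [])]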

    As a bonus, build a word cloud from the comments

    from imageio import imread  # scipy.misc.imread has been removed; imageio provides an equivalent imread
    import matplotlib.pyplot as plt
    import jieba
    from wordcloud import WordCloud
    
    # Text to segment
    f = open('comment_iphone11.txt', 'r', encoding='utf-8')
    text = f.read()
    
    cut_text = ' '.join(jieba.lcut(text))
    print(cut_text)
    # Shape mask for the word cloud
    color_mask = imread("201910051325286.jpg")
    cloud = WordCloud(
        # The font file is expected next to the script and must be set for Chinese text
        font_path='FZMWFont.ttf',
        background_color='white',
        mask=color_mask,
        max_words=200,
        max_font_size=5000
    )
    word_cloud = cloud.generate(cut_text)
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()  

    Result screenshot

    003. Douban Movies Top 250 (three parsers)

    # Goal: crawl the movie info from the Douban Top 250 chart
    # Fields extracted: rank, title, poster URL, score, number of raters, one-line quote
    # Try each of the parsing approaches covered so far: ① regular expressions ② BeautifulSoup ③ XPath
    import requests
    import re  # regular expressions
    import json
    from bs4 import BeautifulSoup  # BS4
    from lxml import etree  # XPath
    # process pool
    from multiprocessing import Pool
    import multiprocessing
    
    
    
    def get_one_page(url):
    	headers = {
    		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    	}
    	response = requests.get(url, headers=headers)
    	if response.status_code == 200:
    		return response.text
    	return None
    
    
    def zhengze_parse(html):
    	pattern = re.compile(
    		'<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
    		+ 'class="inq">(.*?)</span>', re.S)
    	items = re.findall(pattern, html)
    	# print(items)
    	# Film #125 has no one-line quote, so the regex simply fails to match it.
    	# The fix is easy (match the quote separately), but it is left as-is here.
    	for item in items:
    		yield {
    			'index': item[0],
    			'title': item[1],
    			'image': item[2],
    			'score': item[3],
    			'people': item[4].strip()[:-2],
    			'Evaluation': item[5]
    		}
    
    
    def soup_parse(html):
    	soup = BeautifulSoup(html, 'lxml')
    	for data in soup.find_all('div', class_='item'):
    		index = data.em.text
    		image = data.img['src']
    		title = data.img['alt']
    		people = data.find_all('span')[-2].text[:-2]
    		score = data.find('span', class_='rating_num').text
    		# Film #125 has no quote; use an empty string instead
    		if data.find('span', class_='inq'):
    			Evaluation = data.find('span', class_='inq').text
    		else:
    			Evaluation = ''
    		yield {
    			'index': index,
    			'image': image,
    			'title': title,
    			'people': people,
    			'score': score,
    			'Evaluation': Evaluation,
    		}
    
    
    def xpath_parse(html):
    	html = etree.HTML(html)
    	for data in html.xpath('//ol[@class="grid_view"]/li'):
    		index = data.xpath('.//em/text()')[0]
    		image = data.xpath('.//a/img/@src')[0]
    		title = data.xpath('.//a/img/@alt')[0]
    		people = data.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2]
    		score = data.xpath('.//div[@class="star"]/span[2]/text()')[0]
    		# Film #125 has no quote; use an empty string instead
    		if data.xpath('.//p[@class="quote"]/span/text()'):
    			Evaluation = data.xpath('.//p[@class="quote"]/span/text()')[0]
    		else:
    			Evaluation = ''
    		yield {
    			'index': index,
    			'image': image,
    			'title': title,
    			'people': people,
    			'score': score,
    			'Evaluation': Evaluation,
    		}
    
    
    def write_to_file(content, flag):
    	with open('豆瓣电影TOP250(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
    		f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    
    def search(Num):
    	url = 'https://movie.douban.com/top250?start=' + str(Num)
    	html = get_one_page(url)
    	for item in zhengze_parse(html):
    		write_to_file(item, 'regex')
    	for item in soup_parse(html):
    		write_to_file(item, 'BS4')
    	for item in xpath_parse(html):
    		write_to_file(item, 'xpath')
    	print("Crawling page %d" % (Num // 25 + 1))
    
    
    def main():
    	pool = Pool()
    	pool.map(search, [i * 25 for i in range(10)])
    	# # Page offsets -- without a process pool:
    	# for i in range(0, 10):
    	# 	Num = i * 25
    	# 	search(Num)
    	print("Crawl finished")
    
    
    if __name__ == '__main__':
    	# After packaging into an exe, multiprocessing on Windows errors out without this line
    	multiprocessing.freeze_support()
    	# entry point
    	main() 

    Package it into an .exe

    pyinstaller -F 豆瓣电影排行.py

    Run result

     004. Toutiao (street-snap image search)

    # URL building
    from urllib.parse import urlencode
    # HTTP requests
    import requests
    # file operations
    import os
    # md5: hash the image bytes so file names do not repeat
    from hashlib import md5
    # process pool
    from multiprocessing.pool import Pool
    # delays
    import time
    
    base_url = 'https://www.toutiao.com/api/search/content/?'
    
    headers = {
    	'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    	'X-Requested-With': 'XMLHttpRequest',
    }
    
    
    def get_page(offset):
    	# https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
    	# Build params from the link above; only offset changes
    	params = {
    		'aid': '24',
    		'app_name': 'web_search',
    		'offset': offset,
    		'format': 'json',
    		'keyword': '街拍',
    		'autoload': 'true',
    		'count': '20',
    		'en_qc': '1',
    		'cur_tab': '1',
    		'from': 'search_tab',
    		'pd': 'synthesis',
    	}
    	url = base_url + urlencode(params)
    	# Return the JSON payload
    	try:
    		response = requests.get(url, headers=headers)
    		if response.status_code == 200:
    			return response.json()
    	except requests.ConnectionError as e:
    		print('Error', e.args)
    
    
    def get_images(json):
    	if json:
    		items = json.get('data')
    		for item in items:
    			# title
    			title = item.get('title')
    			# image list
    			images = item.get('image_list')
    			for image in images:
    				# yield one dict per image: image url + title
    				yield {
    					'image': image.get('url'),
    					'title': title,
    				}
    
    
    def save_image(item):
    	# Create the target folder if it does not exist
    	dirs = 'F:\\domo'
    	if not os.path.exists(dirs):
    		os.mkdir(dirs)
    	# Change the current working directory
    	os.chdir(dirs)
    	# Create a folder named after the item's title if it does not exist yet
    	if not os.path.exists(item.get('title')):
    		os.mkdir(item.get('title'))
    	try:
    		# Request the image URL
    		response = requests.get(item.get('image'))
    		if response.status_code == 200:
    			# Build the file name from the md5 of the image bytes
    			file_path = '{0}\\{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
    			# Write the image in binary mode if it is not already on disk
    			if not os.path.exists(file_path):
    				with open(file_path, 'wb') as f:
    					f.write(response.content)
    			else:
    				print("Already downloaded:", file_path)
    	except Exception:
    		print("Image download failed")
    
    
    GROUP_START = 1
    GROUP_END = 20
    
    
    def main(offset):
    	json = get_page(offset)
    	for item in get_images(json):
    		print(item)
    		save_image(item)
    
    
    if __name__ == '__main__':
    	pool = Pool()
    	# Build the offset list: 20-400 (20 pages)
    	groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    	# Run main() across processes
    	pool.map(main, groups)
    	# Close the pool
    	pool.close()
    	# Wait for the remaining processes to finish
    	pool.join()

    After about 10 pages the API stops returning data; a User-Agent pool is needed (a sketch follows below).
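
    A minimal sketch of such a pool: keep a list of desktop User-Agent strings and pick one at random per request. The strings below are only examples; any reasonably current set works.

    import random
    import requests
    
    # Example desktop UA strings (swap in your own)
    USER_AGENTS = [
    	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    	'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
    ]
    
    
    def get_with_random_ua(url, **kwargs):
    	"""requests.get with a randomly chosen User-Agent per call (sketch)."""
    	headers = kwargs.pop('headers', {}).copy()
    	headers['User-Agent'] = random.choice(USER_AGENTS)
    	return requests.get(url, headers=headers, **kwargs)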

    Takeaways: 1. Basic os-module operations

        os.chdir('path') -------------------- change the current working directory to path

        os.path.exists('filename') ------------ whether the file exists in the current directory; returns True if it does, False otherwise

        os.mkdir() ----------- create a directory

      2. Naming files by their MD5 hash is an effective way to avoid saving duplicates (see the sketch after this list)

      3. A process pool cuts the crawl time dramatically
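
    A small sketch tying points 1 and 2 together: create a download directory if needed and name each file by the MD5 of its bytes, so re-downloading the same image is a no-op (the directory name and URL are illustrative).

    import os
    from hashlib import md5
    import requests
    
    
    def save_unique(url, dirs='downloads'):
    	"""Download url into dirs, skipping files already saved (hedged sketch)."""
    	if not os.path.exists(dirs):
    		os.mkdir(dirs)  # create the folder on first use
    	content = requests.get(url).content
    	# The MD5 of the bytes keeps the name stable across repeated downloads
    	file_path = os.path.join(dirs, md5(content).hexdigest() + '.jpg')
    	if not os.path.exists(file_path):
    		with open(file_path, 'wb') as f:
    			f.write(content)
    	return file_path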

    005. Weibo

    # URL building
    from urllib.parse import urlencode
    # strip HTML tags from the post text
    from pyquery import PyQuery as pq
    # HTTP requests
    import requests
    # MongoDB connection
    from pymongo import MongoClient
    # Crawling too fast returns 418 at around page 36, so add some delay
    import time
    
    # Connect
    client = MongoClient()
    # Select the database
    db = client['weibo']
    # Select the collection
    collection = db['weibo_domo2']
    
    max_page = 100
    
    
    # Save to MongoDB (insert_one replaces the deprecated insert)
    def save_to_mongo(result):
    	if collection.insert_one(result):
    		print("saved to mongo")
    
    
    # https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2
    # Look for the Ajax request carrying X-Requested-With: XMLHttpRequest
    # Base url; the query string is appended with urlencode
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    
    # https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474
    headers = {
    	'host': 'm.weibo.cn',
    	# Open the profile on the mobile site, grab this link, then parse it
    	# 'Referer': 'https://m.weibo.cn/p/1005052830678474',
    	'Referer': 'https://m.weibo.cn/u/2202323951',
    	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    	'X-Requested-With': 'XMLHttpRequest',
    }
    
    
    def get_page(page):
    	params = {
    		'type': 'uid',
    		'value': '2202323951',
    		# 'containerid': '1076032830678474',
    		'containerid': '1076032202323951',
    		'page': page,
    	}
    	url = base_url + urlencode(params)
    	print(url)
    	try:
    		response = requests.get(url, headers=headers)
    		if response.status_code == 200:
    			# response = json.dump(response.text)
    			return response.json(), page
    	except requests.ConnectionError as e:
    		print('Error', e.args)
    
    
    def parse_page(json, page: int):
    	if json:
    		# Only the data under data -> cards is needed
    		items = json.get('data').get('cards')
    		# index: position of the card within the page
    		for index, item in enumerate(items):
    			# On page 1, the card at index 1 has no mblog (it is the follow list, which we do not want),
    			# so iterating over it blindly raises an error -- skip it
    			if index == 1 and page == 1:
    				continue
    			else:
    				item = item.get('mblog')
    				weibo = {}
    				# weibo ID
    				# "id":"4349509976406880",
    				weibo['ID'] = item.get('id')
    				# post text, with HTML tags stripped by pyquery
    				weibo['text'] = pq(item.get('text')).text()
    				# phone/client used to post
    				weibo['phone'] = item.get('source')
    				# post time
    				weibo['time'] = item.get('edit_at')
    				# number of likes
    				weibo['attitudes'] = item.get('attitudes_count')
    				# number of comments
    				weibo['comments'] = item.get('comments_count')
    				# number of reposts
    				weibo['reposts'] = item.get('reposts_count')
    				yield weibo
    
    
    if __name__ == '__main__':
    	for page in range(1, max_page + 1):
    		json = get_page(page)
    		# get_page returns (json, page); unpack both into parse_page
    		results = parse_page(*json)
    		time.sleep(3)
    		for result in results:
    			print(result)
    			save_to_mongo(result) 

    Takeaways:

      1. Without a delay, pages 36-38 start returning 418 ("418 I'm a teapot": the server refuses to brew coffee in a teapot); a simple retry with delay helps, see the sketch after this list.

      2. An Ajax feed can contain cards you do not want in the middle of the data; on Weibo, page 1 / index 1 is the follow list, not a post.

      3. Getting the Ajax data from the mobile site is much easier than from the desktop site.

      4. Starting mongod requires a dbpath (where the data is stored); you can then count the inserted documents:

        e.g. mongod --dbpath="F:\MongoDB\Server\3.4\data"

        e.g. db.weibo_domo2.find().count()

      5. In the end all of 朱子奇's posts were crawled: 959 in total.
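
    A hedged sketch of that retry logic, wrapping a get_page-style request: on a 418, back off and try again a few times (the delays and retry count are arbitrary).

    import time
    import requests
    
    
    def get_json_with_backoff(url, headers, retries=3, delay=5):
    	"""GET url and return the JSON body, backing off on HTTP 418 (sketch)."""
    	for attempt in range(retries):
    		response = requests.get(url, headers=headers)
    		if response.status_code == 200:
    			return response.json()
    		if response.status_code == 418:
    			# Rate-limited: wait a bit longer on each attempt before retrying
    			time.sleep(delay * (attempt + 1))
    			continue
    		break
    	return None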

    006. Maoyan movies Top 100

    https://www.cnblogs.com/shuimohei/p/10400814.html

    007. Baidu Baike

    https://www.cnblogs.com/shuimohei/p/10339891.html

    008. Douyu live streams

    '''
    The Ajax requests carry many encrypted parameters and cannot be fetched directly, so fall back to Selenium.
    '''
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import unittest
    from selenium import webdriver
    from bs4 import BeautifulSoup as bs
    import time
    
    
    class douyu(unittest.TestCase):
    	# The set-up hook must be named setUp()
    	def setUp(self):
    		# self.driver = webdriver.Chrome()
    		self.driver = webdriver.PhantomJS()
    		self.num = 0
    		self.count = 0
    
    	# Test methods must start with "test"
    	def testDouyu(self):
    		self.driver.get("https://www.douyu.com/directory/all")
    
    		while True:
    			soup = bs(self.driver.page_source, "lxml")
    			# Room names, returned as a list
    			names = soup.find_all("h3", {"class": "DyListCover-intro"})
    			# Room popularity ("hot" counts), returned as a list
    			numbers = soup.find_all("span", {"class": "DyListCover-hot"})
    			print(names, numbers)
    			for name, number in zip(names, numbers):
    				self.num += 1
    				result = (u"Room heat: -" + number.get_text().strip() +
    						  u"-	Room name: " + name.get_text().strip() +
    						  u'-	Rooms counted: ' + str(self.num))
    				print(result)
    				with open('123.txt', 'a', encoding='utf-8') as f:
    					f.write(result + '\n')
    
    			# self.count += int(number.get_text().strip())
    
    			# If the "next page" button is rendered as disabled in the page source, stop
    			if self.driver.page_source.find("dy-Pagination-disabled dy-Pagination-next") != -1:
    				break
    
    			# Slow network: add a delay (an explicit wait until the button is clickable would be better)
    			time.sleep(1)
    			# Keep clicking "next page"
    			self.driver.find_element_by_class_name("dy-Pagination-next").click()
    			time.sleep(1)
    
    	# Runs after the tests finish
    	def tearDown(self):
    		print("Rooms currently streaming: " + str(self.num))
    		print("Total site heat: " + str(self.count))
    		# Quit the PhantomJS browser
    		self.driver.quit()
    
    
    if __name__ == "__main__":
    	# run the test module
    	unittest.main()
    

    Selenium is on the slow side, and the fixed sleeps make it slower still; an explicit wait is sketched below.
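
    A hedged alternative to the fixed time.sleep(1) calls: wait only as long as it takes for the next-page button to become clickable, using Selenium's explicit waits (the class name is taken from the script above).

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    def click_next_page(driver, timeout=10):
    	"""Click the next-page button as soon as it is clickable (sketch)."""
    	next_btn = WebDriverWait(driver, timeout).until(
    		EC.element_to_be_clickable((By.CLASS_NAME, "dy-Pagination-next"))
    	)
    	next_btn.click()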

     009. Sunshine Hotline petition platform (wz.sun0769.com)

    1. Create the project

    scrapy startproject dongguan

    2. Create the spider

    scrapy genspider -t crawl sun  wz.sun0769.com

    3.items.py  

    import scrapy
    
    
    class DongguanItem(scrapy.Item):
        # define the fields for your item here like:
        title = scrapy.Field()
        data = scrapy.Field()
        num = scrapy.Field()
        content = scrapy.Field()
        url = scrapy.Field()

    4.sun.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from dongguan.items import DongguanItem
    
    
    class SunSpider(CrawlSpider):
    	name = 'sun'
    	allowed_domains = ['wz.sun0769.com']
    	start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    	rules = (
    		# pagination links
    		Rule(LinkExtractor(allow=r'page=\d+'), follow=True),
    		# individual post links
    		Rule(LinkExtractor(allow=r'id=\d+'), callback='parse_item', follow=False),
    	)
    
    	def parse_item(self, response):
    		print(response.url)
    		print(response)
    		item = DongguanItem()
    		item['title'] = response.xpath('//p[@class="focus-details"]/text()').extract_first()
    		item['data'] = response.xpath('//span[@class="fl"]/text()').extract()[0][4:]
    		item['num'] = response.xpath('//span[@class="fl"]/text()').extract()[2][3:]
    		# normalize-space() strips the surrounding whitespace and newlines inside the XPath result
    		item['content'] = response.xpath('normalize-space(//div[@class="details-box"]/pre/text())').extract_first()
    		item['url'] = response.url
    
    		yield item 

    5.pipelines.py

    import json
    
    
    class DongguanPipeline(object):
    
    	def __init__(self):
    		self.filename = open('dongguan.txt', 'wb')
    
    	def process_item(self, item, spider):
    		text = json.dumps(dict(item), ensure_ascii=False) + '\n'
    		self.filename.write(text.encode('utf-8'))
    		return item
    
    	def close_spider(self, spider):
    		self.filename.close()

    6.settings.py

    ROBOTSTXT_OBEY = False
    
    ITEM_PIPELINES = {
       'dongguan.pipelines.DongguanPipeline': 300,
    }
    
    
    # Log file name and log level
    LOG_FILE = "dg.log"
    LOG_LEVEL = "DEBUG"

    7. Run the spider

    scrapy crawl sun

    8. Run result

    010. Sina News site-wide crawler by category

    1. Create the project

    scrapy startproject sina
    

    2. Create the spider

    scrapy genspider xinlang sina.com.cn
    

      

    3.items.py 

    # -*- coding: utf-8 -*-
    
    import scrapy
    import sys, importlib
    
    importlib.reload(sys)
    
    
    class SinaItem(scrapy.Item):
    	# Level 1: top-level category title and url
    	parentTitle = scrapy.Field()
    	parentUrls = scrapy.Field()
    
    	# Level 2: sub-category title and sub-url
    	subTitle = scrapy.Field()
    	subUrls = scrapy.Field()
    
    	# Local storage: directory path for the sub-category
    	subFilename = scrapy.Field()
    
    	# Level 3: article links under the sub-category
    	sonUrls = scrapy.Field()
    
    	# Scraped data: article headline and body
    	head = scrapy.Field()
    	content = scrapy.Field()
    

      

    4. xinlang.py ---- the news pages come in too many layouts to cover them all; the parsing below is incomplete

    # -*- coding: utf-8 -*-
    import scrapy
    # for creating directories
    import os
    from sina.items import SinaItem
    
    
    class XinlangSpider(scrapy.Spider):
    	name = 'xinlang'
    	allowed_domains = ['sina.com.cn']
    	start_urls = ['http://news.sina.com.cn/guide/']
    
    	def parse(self, response):
    		items = []
    		# Use xpath to find the URLs and titles of all top-level categories (19 of them)
    		parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
    		parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
    
    		# Find the URLs and titles of all sub-categories (299 of them)
    		subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
    		subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()
    
    		# Walk all top-level categories
    		for i in range(0, len(parentTitle)):
    			# Path and name of the top-level category directory
    			parentFilename = "./Data/" + parentTitle[i]
    
    			# Create the directory if it does not exist
    			if (not os.path.exists(parentFilename)):
    				os.makedirs(parentFilename)
    
    			# Walk all sub-categories
    			for j in range(0, len(subUrls)):
    				item = SinaItem()
    
    				# Save the top-level category title and url
    				item['parentTitle'] = parentTitle[i]
    				item['parentUrls'] = parentUrls[i]
    
    				# Does the sub-category url start with this top-level url? (e.g. sports.sina.com.cn vs sports.sina.com.cn/nba)
    				if_belong = subUrls[j].startswith(item['parentUrls'])
    
    				# If it belongs to this top-level category, store it under that category's directory
    				if (if_belong):
    					subFilename = parentFilename + '/' + subTitle[j]
    					# Create the directory if it does not exist
    					if (not os.path.exists(subFilename)):
    						os.makedirs(subFilename)
    
    					# Record the sub-category url, title and directory
    					item['subUrls'] = subUrls[j]
    					item['subTitle'] = subTitle[j]
    					item['subFilename'] = subFilename
    
    					items.append(item)
    
    			# Send a Request for each sub-category url; the Response plus the meta data goes to the second_parse callback
    			for item in items:
    				yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)
    
    	# Recursively request the returned sub-category urls
    	def second_parse(self, response):
    		# Pull the meta data out of each Response
    		meta_1 = response.meta['meta_1']
    
    		# Take every link under an <a> tag inside the sub-category page
    		sonUrls = response.xpath('//a/@href').extract()
    
    		items = []
    		for i in range(0, len(sonUrls)):
    			# A link counts as a news article if it starts with the top-level url and ends with .shtml
    			if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])
    
    			# If it belongs to this top-level category, copy the fields into one item for passing along
    			if (if_belong):
    				item = SinaItem()
    				item['parentTitle'] = meta_1['parentTitle']
    				item['parentUrls'] = meta_1['parentUrls']
    				item['subUrls'] = meta_1['subUrls']
    				item['subTitle'] = meta_1['subTitle']
    				item['subFilename'] = meta_1['subFilename']
    				item['sonUrls'] = sonUrls[i]
    				items.append(item)
    
    		# Send a Request for each article url; the Response plus the meta data goes to the detail_parse callback
    		for item in items:
    			yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)
    
    	# Parse the article page: headline and body text
    	def detail_parse(self, response):
    		item = response.meta['meta_2']
    		content = ""
    		head = response.xpath('//h1[@class="main-title"]/text()').extract()
    		content_list = response.xpath('//div[@class="article"]/p/text()').extract()
    		# If this layout did not match, fall through to the older page layouts
    		if len(content_list) < 1:
    			# "News centre" layout, e.g. http://news.sina.com.cn/w/2004-12-20/11314575163s.shtml
    			head = response.xpath('//th[@class="f24"]//h1/text()').extract()
    			content_list = response.xpath('//td[@class="l17"]/font/p/text()').extract()
    		if len(content_list) < 1:
    			# http://news.sina.com.cn/c/2012-09-21/092225223127.shtml
    			head = response.xpath('//div[@class="blk_content"]/h1/text()').extract()
    			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
    		if len(content_list) < 1:
    			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
    			head = response.xpath('//h1[@id="artibodyTitle"]/text()').extract()
    			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
    		if len(content_list) < 1:
    			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
    			head = response.xpath('//h1[@class="main-title"]/text()').extract()
    			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
    		if len(content_list) < 1:
    			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
    			head = response.xpath('//h1[@id="artibodyTitle"]/font/text()').extract()
    			content_list = response.xpath('//div[@id="artibody"]//span/text()').extract()
    
    		if len(head) < 1:
    			# Anything that slipped through the net
    			head = ['error']
    			content_list = [response.url]
    		# Join the text of all the <p> tags together
    		for content_one in content_list:
    			content += content_one
    
    		item['head'] = head
    		item['content'] = content
    
    		yield item
    

      

    5.pipelines.py

    import json
    from scrapy import signals
    
    
    class SinaPipeline(object):
    
    	def process_item(self, item, spider):
    		sonUrls = item['sonUrls']
    
    		# The file name is the middle part of the article url with / replaced by _, saved as .txt
    		filename = sonUrls[7:-6].replace('/', '_')
    		filename += ".txt"
    
    		fp = open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8')
    		fp.write(item['content'])
    		fp.close()
    
    		return item
    

    6. settings.py 

    BOT_NAME = 'sina'
    
    SPIDER_MODULES = ['sina.spiders']
    NEWSPIDER_MODULE = 'sina.spiders'
    
    
    ROBOTSTXT_OBEY = False
    
    
    
    DOWNLOAD_DELAY = 0.5
    
    ITEM_PIPELINES = {
       'sina.pipelines.SinaPipeline': 300,
    }
    
    
    
    # Log file name and log level
    LOG_FILE = "dg.log"
    LOG_LEVEL = "DEBUG"
    

      

    7.main.py

    Create a main.py in the project root for debugging

    from scrapy import cmdline
    
    cmdline.execute('scrapy crawl xinlang'.split())
    

      

    8. Run it

    Just run main.py

    9. Results

    It crawls part of the news; far from complete

    Successful requests: 4416

    Maximum depth: 2

      
