爬取电影网站
movie.py
import scrapy
from MyProjectDianying.items import MyprojectdianyingItem
class MovieSpider(scrapy.Spider):
    """Crawl the 1905.com VOD listing and collect each movie's link, title and detail text.

    ``parse`` handles listing pages: it emits one detail-page request per
    listed movie and then schedules the next listing page (pages 2 and 3).
    ``parse_href`` handles detail pages and yields the completed item.
    """

    name = 'movie'
    start_urls = ['https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx']
    # Template for the listing pages after the first one; %d is the page number.
    url = 'https://www.1905.com/vod/list/n_1_t_1/o3p%d.html'
    # Number of the next listing page to request; pagination stops once it reaches 4.
    page = 2

    def parse(self, response):
        """Parse one listing page.

        Yields a request for every movie's detail page (carrying a partially
        filled item in ``meta``) followed by, at most, one request for the
        next listing page.
        """
        divs = response.xpath('//*[@id="content"]/section[4]/div')
        for div in divs:
            # extract_first() returns None instead of raising IndexError when
            # a <div> has no matching <a>, so one odd entry cannot abort the page.
            href = div.xpath('./a/@href').extract_first()
            title = div.xpath('./a/@title').extract_first()
            if not href:
                continue

            # Resolve relative / protocol-relative links; an already absolute
            # URL is returned unchanged.
            href = response.urljoin(href)

            item = MyprojectdianyingItem()
            item["href"] = href
            item["title"] = title or ''
            print(title)
            yield scrapy.Request(href, callback=self.parse_href, meta={'item': item})

        if self.page < 4:
            url = self.url % self.page
            # Advance the counter before yielding so it is updated even if the
            # consumer stops iterating right after this request.
            self.page += 1
            yield scrapy.Request(url, callback=self.parse)

    def parse_href(self, response):
        """Parse one movie detail page and yield the finished item.

        The item started in ``parse`` is read from ``response.meta['item']``;
        its ``detail`` field is set to the first text node of the
        ``playerBoxIntroCon`` element, or to an empty string when that
        element is absent.
        """
        item = response.meta['item']
        detail = response.xpath('//*[@id="playerBoxIntroCon"]/text()').extract_first()
        item["detail"] = detail or ''
        yield item
items.py
import scrapy
class MyprojectdianyingItem(scrapy.Item):
    """Container for one scraped movie record."""

    # Link to the movie's detail page.
    href = scrapy.Field()
    # Movie title.
    title = scrapy.Field()
    # Descriptive text taken from the movie's detail page.
    detail = scrapy.Field()
settings.py
# Default User-Agent header; a desktop Chrome string is used here.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# robots.txt rules are not consulted for this crawl.
ROBOTSTXT_OBEY = False

# Keep console output quiet: only ERROR and above are logged.
LOG_LEVEL = 'ERROR'

# Enabled item pipelines, each mapped to its ordering value.
ITEM_PIPELINES = {
    'MyProjectDianying.pipelines.MyprojectdianyingPipeline': 300,
}
pipelines.py
class MyprojectdianyingPipeline:
    """Item pipeline that writes every scraped movie record to ``dianying.txt``.

    The file is opened once when the spider starts and closed when it
    finishes; each item is written as ``title + href + detail`` followed by
    a single space.
    """

    # Output file handle: set by open_spider(), released by close_spider().
    fp = None

    def open_spider(self, spider):
        """Open ``dianying.txt`` for writing (UTF-8), truncating any previous content."""
        self.fp = open('dianying.txt', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one record to the output file and pass the item on unchanged.

        A missing or ``None`` field is written as an empty string so that a
        single incomplete item cannot raise and lose the record.
        """
        href = item.get("href") or ''
        title = item.get("title") or ''
        detail = item.get("detail") or ''
        # NOTE(review): records are separated by one space, so the whole file
        # is a single line - confirm whether a newline was intended here.
        self.fp.write(title + href + detail + ' ')
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened; safe to call more than once."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None